diff --git a/main.py b/main.py
index 6f2cd14..4ac6884 100644
--- a/main.py
+++ b/main.py
@@ -6,23 +6,27 @@ from collections import Counter
 from bs4 import BeautifulSoup
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Dig out links from a website.")
-    parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
+    parser = argparse.ArgumentParser(
+        description="Dig out links from a website.")
+    parser.add_argument(
+        'site', type=str, help="Website that you want to scrape for links.")
     parser.add_argument('-f', '--filter', type=str, default=r'..*$',
                         help="Only return file names matching this regular expression.")
-    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
-    parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
+    parser.add_argument('-s', '--summary', action='store_true',
+                        help="Print summary and quit.")
+    parser.add_argument('-d', '--debug', action='store_true',
+                        help="Print out debugging information.")
     parser.add_argument('-r', '--save-report', action='store_true',
                         help='Save report of all the original links to a file.')
-
+
     args = parser.parse_args()
-
+
     # todo: save in tmp
     h = httplib2.Http('.cache')
     response, content = h.request(args.site)
-
+
     s = BeautifulSoup(content, features='html.parser')
-
+
     # if args.debug:
     #     print(s.prettify())
@@ -40,7 +44,8 @@ if __name__ == '__main__':
             has_jpeg = True
         extensions.append(extension)
         if args.debug:
-            print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
+            print(
+                f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
     # todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg))
     print(f'URL: {args.site}')
     print(f'Total file names: {len(links)}')
@@ -48,15 +53,15 @@ if __name__ == '__main__':
     # todo: in the counts of extensions mention varying capitalisation
     extension_count = Counter(extensions)
     if has_jpeg:
-
-        for extension, count in extension_count.items():
-            # do not print the first character (dot) of extension
-            if not has_jpeg:
-                print(f'{count}\t{extension[1:]}')
-            else:
-                if extension == '.jpg' or extension == '.jpeg':
-                    print(f"{count}\t{extension[1:]} (with some as .jpeg")
-        exit(0)
+
+        for extension, count in extension_count.items():
+            # do not print the first character (dot) of extension
+            if not has_jpeg:
+                print(f'{count}\t{extension[1:]}')
+            else:
+                if extension == '.jpg' or extension == '.jpeg':
+                    print(f"{count}\t{extension[1:]} (with some as .jpeg")
+        exit(0)
     # we got to the end, print out all the links
     link_list = [f'{args.site}{link["href"]}' for link in links]
     for link in link_list:
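
Note: the whitespace cleanup in the last hunk leaves the summary block's logic as it was. Inside the outer "if has_jpeg:" the inner "if not has_jpeg:" branch can never run, so extensions other than .jpg/.jpeg are never printed, and the "(with some as .jpeg" message is missing its closing parenthesis. Below is a minimal sketch, not part of this patch, of how that block could print a count for every extension while folding .jpeg into .jpg, as the in-file todo suggests; the helper name print_summary is hypothetical.

    # Sketch only: count every extension, merging '.jpg' and '.jpeg'.
    from collections import Counter

    def print_summary(extensions):
        # Normalise case so '.JPG' and '.jpg' count as one extension.
        counts = Counter(ext.lower() for ext in extensions)
        jpeg_count = counts.pop('.jpeg', 0)  # fold .jpeg into .jpg
        if jpeg_count:
            counts['.jpg'] += jpeg_count
        for extension, count in counts.most_common():
            name = extension[1:]  # drop the leading dot
            if extension == '.jpg' and jpeg_count:
                print(f'{count}\t{name} ({jpeg_count} as .jpeg)')
            else:
                print(f'{count}\t{name}')

Calling print_summary(extensions) in place of the "if has_jpeg:" block would report all extensions (not just jpg/jpeg) and produce output like "12  jpg (3 as .jpeg)".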