From 1ffe83e493dcd888c61479ba850222697b1d0511 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Kucharczyk?=
Date: Wed, 29 Jul 2020 22:21:53 +0200
Subject: [PATCH] Save unfinished work.

---
 main.py | 53 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/main.py b/main.py
index 59da35e..6f2cd14 100644
--- a/main.py
+++ b/main.py
@@ -8,26 +8,55 @@ from bs4 import BeautifulSoup
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Dig out links from a website.")
     parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
-    parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
+    parser.add_argument('-f', '--filter', type=str, default=r'\..*$',
+                        help="Only return file names matching this regular expression.")
     parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
+    parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
+    parser.add_argument('-r', '--save-report', type=str, metavar='FILE',
+                        help='Save a report of all the original links to the given file.')
+
     args = parser.parse_args()
-    """todo: save in tmp"""
+    # todo: save in tmp
     h = httplib2.Http('.cache')
     response, content = h.request(args.site)
-    s = BeautifulSoup(content,features='html.parser')
+    s = BeautifulSoup(content, features='html.parser')
 
-    """find only file names"""
+    # if args.debug:
+    #     print(s.prettify())
+
     links = s.find_all(href=re.compile(args.filter))
     if not links or links == 0:
-        exit("No filenames found with the given filter.")
+        exit("No file names found with the given filter.")
     if args.summary:
+        # grab only the extension from splitext
+        extensions = []
+        has_jpeg = False
+        for link in links:
+            filename, extension = os.path.splitext(link['href'])
+            if extension:
+                if extension == '.jpeg':
+                    has_jpeg = True
+                extensions.append(extension)
+                if args.debug:
+                    print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
+        # todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg))
         print(f'URL: {args.site}')
-        print(f'Total filenames: {len(links)}')
-        """create a list from extensions"""
-        exts = [os.path.splitext(ext['href'])[1] for ext in links]
+        print(f'Total file names: {len(links)}')
         print('File types:')
-        for ext, count in Counter(exts).items():
-            print(f'{count}\t{ext[1:]}')
+        # todo: in the counts of extensions mention varying capitalisation
+        extension_count = Counter(extensions)
+        for extension, count in extension_count.items():
+            # do not print the first character (dot) of the extension
+            print(f'{count}\t{extension[1:]}')
+        if has_jpeg:
+            # until the jpg/jpeg merge todo above is done, flag the split counts
+            print('(jpg and jpeg are counted separately for now)')
         exit(0)
-    for link in links:
-        print(args.site + link['href'])
+    # we got to the end, print out all the links
+    link_list = [f'{args.site}{link["href"]}' for link in links]
+    for link in link_list:
+        print(link)
+    if args.save_report:
+        with open(args.save_report, 'w') as file:
+            # todo: create summary object that will be printed with --summary or saved to a file with --save-report
+            print(link_list, file=file)
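
Note on the jpg/jpeg merge todo: one way to finish it is to fold the two
spellings together before counting, so the summary can display them as
"xxx jpg (yyy as jpeg)". A minimal sketch, not part of the patch, reusing
the extensions list built in the summary branch:

    from collections import Counter

    # fold .jpeg into .jpg before counting, then report the jpeg share
    extension_count = Counter(
        '.jpg' if ext == '.jpeg' else ext for ext in extensions
    )
    jpeg_count = extensions.count('.jpeg')
    for extension, count in extension_count.items():
        if extension == '.jpg' and jpeg_count:
            # display as: xxx jpg (yyy as jpeg), per the todo
            print(f'{count}\t{extension[1:]} ({jpeg_count} as .jpeg)')
        else:
            print(f'{count}\t{extension[1:]}')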
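
Note on the summary-object todo: the same object could back both
--summary (printed to stdout) and --save-report (written to a file). A
minimal sketch with a hypothetical Summary dataclass; the name and
fields are illustrative, not part of the patch:

    from collections import Counter
    from dataclasses import dataclass

    @dataclass
    class Summary:
        site: str
        links: list          # full URLs, as built in link_list
        extensions: Counter  # extension -> count

        def __str__(self):
            lines = [f'URL: {self.site}',
                     f'Total file names: {len(self.links)}',
                     'File types:']
            lines += [f'{count}\t{ext[1:]}'
                      for ext, count in self.extensions.items()]
            return '\n'.join(lines)

    # printed with --summary:      print(summary)
    # saved with --save-report:    print(summary, file=file)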