diff --git a/main.py b/main.py
index f838b01..59da35e 100644
--- a/main.py
+++ b/main.py
@@ -1,14 +1,18 @@
 import httplib2
 import re
 import argparse
+import os.path
+from collections import Counter
 from bs4 import BeautifulSoup
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Dig out links from a website.")
     parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
     parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
+    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
     args = parser.parse_args()
 
+    # TODO: save the response cache in a temp directory instead of ./.cache
     h = httplib2.Http('.cache')
     response, content = h.request(args.site)
 
@@ -18,5 +22,14 @@ if __name__ == '__main__':
     links = s.find_all(href=re.compile(args.filter))
     if not links or links == 0:
         exit("No filenames found with the given filter.")
+    if args.summary:
+        print(f'URL: {args.site}')
+        print(f'Total filenames: {len(links)}')
+        # Collect the file extension of each matched href (empty string if none).
+        exts = [os.path.splitext(link['href'])[1] for link in links]
+        print('File types:')
+        for ext, count in Counter(exts).items():
+            print(f'{count}\t{ext[1:]}')
+        exit(0)
     for link in links:
         print(args.site + link['href'])