import argparse
import os.path
import re
import sys
from collections import Counter

import httplib2
from bs4 import BeautifulSoup

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Dig out links from a website.")
    parser.add_argument('site', type=str,
                        help="Website that you want to scrape for links.")
    parser.add_argument('-f', '--filter', type=str, default=r'\..*$',
                        help="Only return filenames matching this regular expression.")
    parser.add_argument('-s', '--summary', action='store_true',
                        help="Print summary and quit.")
    args = parser.parse_args()

    # TODO: save the cache in a temporary directory
    h = httplib2.Http('.cache')
    response, content = h.request(args.site)
    soup = BeautifulSoup(content, features='html.parser')

    # Find only hrefs that look like file names (i.e. match the filter pattern).
    links = soup.find_all(href=re.compile(args.filter))
    if not links:
        sys.exit("No filenames found with the given filter.")

    if args.summary:
        print(f'URL: {args.site}')
        print(f'Total filenames: {len(links)}')

        # Build a list of file extensions from the matched links.
        exts = [os.path.splitext(link['href'])[1] for link in links]
        print('File types:')
        for ext, count in Counter(exts).items():
            print(f'{count}\t{ext[1:]}')
        sys.exit(0)

    for link in links:
        print(args.site + link['href'])
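# Example invocation (a sketch only; the script name, URL, and filter pattern
# below are illustrative assumptions, not taken from the original):
#   python scrape_links.py https://example.com/downloads/ -f '\.pdf$' -s
# The -s flag prints the per-extension summary and exits; without it the
# script prints each matched link appended to the site URL.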