import argparse
import os.path
import re
import sys
from collections import Counter

import httplib2
from bs4 import BeautifulSoup

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Dig out links from a website.")
    parser.add_argument('site', type=str,
                        help="Website that you want to scrape for links.")
    parser.add_argument('-f', '--filter', type=str, default=r'..*$',
                        help="Only return file names matching this regular expression.")
    parser.add_argument('-s', '--summary', action='store_true',
                        help="Print a summary and quit.")
    parser.add_argument('-d', '--debug', action='store_true',
                        help="Print out debugging information.")
    parser.add_argument('-r', '--save-report', type=str, metavar='FILE',
                        help="Save a report of all the original links to the given file.")
    args = parser.parse_args()

    # todo: save the cache in tmp
    h = httplib2.Http('.cache')
    response, content = h.request(args.site)

    s = BeautifulSoup(content, features='html.parser')
    # if args.debug:
    #     print(s.prettify())

    links = s.find_all(href=re.compile(args.filter))
    if not links:
        sys.exit("No file names found with the given filter.")

    if args.summary:
        # grab only the extension from splitext
        extensions = []
        for link in links:
            filename, extension = os.path.splitext(link['href'])
            if extension:
                extensions.append(extension)
                if args.debug:
                    print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')

        print(f'URL: {args.site}')
        print(f'Total file names: {len(links)}')
        print('File types:')
        # todo: in the counts of extensions mention varying capitalisation
        extension_count = Counter(extensions)
        # merge .jpeg into .jpg and display as: xxx jpg (yyy as .jpeg)
        jpeg_count = extension_count.pop('.jpeg', 0)
        if jpeg_count:
            extension_count['.jpg'] += jpeg_count
        for extension, count in extension_count.items():
            # do not print the first character (dot) of the extension
            if extension == '.jpg' and jpeg_count:
                print(f'{count}\t{extension[1:]} ({jpeg_count} as .jpeg)')
            else:
                print(f'{count}\t{extension[1:]}')
        sys.exit(0)

    # we got to the end, print out all the links
    link_list = [f'{args.site}{link["href"]}' for link in links]
    for link in link_list:
        print(link)

    if args.save_report:
        # todo: create summary object that will be printed with --summary or saved to file with --save-summary
        with open(args.save_report, 'w') as file:
            print(link_list, file=file)
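
# Example invocations (a sketch; the file name "scrape_links.py", the URL, and the
# filter pattern are assumptions for illustration, not part of the script itself):
#
#   python scrape_links.py https://example.com/photos/ --filter '\.jpe?g$' --summary
#   python scrape_links.py https://example.com/photos/ --save-report links.txt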