Update formatting
This commit is contained in:
		
							
								
								
									
										41
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										41
									
								
								main.py
									
									
									
									
									
								
							| @ -6,23 +6,27 @@ from collections import Counter | ||||
| from bs4 import BeautifulSoup | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     parser = argparse.ArgumentParser(description="Dig out links from a website.") | ||||
|     parser.add_argument('site', type=str, help="Website that you want to scrape for links.") | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description="Dig out links from a website.") | ||||
|     parser.add_argument( | ||||
|         'site', type=str, help="Website that you want to scrape for links.") | ||||
|     parser.add_argument('-f', '--filter', type=str, default=r'..*$', | ||||
|                         help="Only return file names matching this regular expression.") | ||||
|     parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.") | ||||
|     parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.") | ||||
|     parser.add_argument('-s', '--summary', action='store_true', | ||||
|                         help="Print summary and quit.") | ||||
|     parser.add_argument('-d', '--debug', action='store_true', | ||||
|                         help="Print out debugging information.") | ||||
|     parser.add_argument('-r', '--save-report', action='store_true', | ||||
|                         help='Save report of all the original links to a file.') | ||||
|      | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|      | ||||
|  | ||||
|     # todo: save the HTTP cache in a tmp directory (currently ./.cache) | ||||
|     h = httplib2.Http('.cache') | ||||
|     response, content = h.request(args.site) | ||||
|      | ||||
|  | ||||
|     s = BeautifulSoup(content, features='html.parser') | ||||
|      | ||||
|  | ||||
|     # if args.debug: | ||||
|     #     print(s.prettify()) | ||||
|  | ||||
| @ -40,7 +44,8 @@ if __name__ == '__main__': | ||||
|                     has_jpeg = True | ||||
|                 extensions.append(extension) | ||||
|             if args.debug: | ||||
|                 print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}') | ||||
|                 print( | ||||
|                     f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}') | ||||
|         # todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg)) | ||||
|         print(f'URL: {args.site}') | ||||
|         print(f'Total file names: {len(links)}') | ||||
| @ -48,15 +53,15 @@ if __name__ == '__main__': | ||||
|         # todo: mention varying capitalisation in the extension counts | ||||
|         extension_count = Counter(extensions) | ||||
|         if has_jpeg: | ||||
|              | ||||
|         for extension, count in extension_count.items(): | ||||
|             # do not print the first character (dot) of extension | ||||
|             if not has_jpeg: | ||||
|                 print(f'{count}\t{extension[1:]}') | ||||
|             else: | ||||
|                 if extension == '.jpg' or extension == '.jpeg': | ||||
|                     print(f"{count}\t{extension[1:]} (with some as .jpeg") | ||||
|         exit(0) | ||||
|  | ||||
|             for extension, count in extension_count.items(): | ||||
|                 # do not print the first character (dot) of extension | ||||
|                 if not has_jpeg: | ||||
|                     print(f'{count}\t{extension[1:]}') | ||||
|                 else: | ||||
|                     if extension == '.jpg' or extension == '.jpeg': | ||||
|                         print(f"{count}\t{extension[1:]} (with some as .jpeg") | ||||
|             exit(0) | ||||
|     # we got to the end, print out all the links | ||||
|     link_list = [f'{args.site}{link["href"]}' for link in links] | ||||
|     for link in link_list: | ||||
|  | ||||
		Reference in New Issue
	
	Block a user