Save unfinished work.
This commit is contained in:
		
							
								
								
									
										56
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										56
									
								
								main.py
									
									
									
									
									
								
							| @ -8,28 +8,60 @@ from bs4 import BeautifulSoup | |||||||
if __name__ == '__main__':
    # Scrape a website for links whose href matches a filter regex, then
    # either print a per-extension summary (--summary) or print/save the
    # reconstructed link list.
    parser = argparse.ArgumentParser(description="Dig out links from a website.")
    parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
    # Raw string with an escaped dot: the default should match hrefs that
    # contain a file extension, not "any character followed by anything".
    parser.add_argument('-f', '--filter', type=str, default=r'\..*$',
                        help="Only return file names matching this regular expression.")
    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
    parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
    # Takes a file path: a bare store_true flag cannot be handed to open().
    parser.add_argument('-r', '--save-report', type=str, default=None, metavar='FILE',
                        help='Save report of all the original links to a file.')

    args = parser.parse_args()

    # todo: save in tmp
    h = httplib2.Http('.cache')
    response, content = h.request(args.site)

    s = BeautifulSoup(content, features='html.parser')

    links = s.find_all(href=re.compile(args.filter))
    if not links:
        # find_all returns a (possibly empty) result set, never 0 —
        # the truthiness test alone is sufficient.
        exit("No file names found with the given filter.")
    if args.summary:
        # Collect only the extension part from splitext for each link.
        extensions = []
        jpeg_count = 0
        for link in links:
            filename, extension = os.path.splitext(link['href'])
            if extension:
                # Merge jpeg and jpg into one extension; remember how many
                # were originally .jpeg so the summary can mention them.
                if extension == '.jpeg':
                    jpeg_count += 1
                    extension = '.jpg'
                extensions.append(extension)
            if args.debug:
                print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
        print(f'URL: {args.site}')
        print(f'Total file names: {len(links)}')
        print('File types:')
        # todo: in the counts of extensions mention varying capitalisation
        extension_count = Counter(extensions)
        for extension, count in extension_count.items():
            # Do not print the first character (the dot) of the extension.
            if extension == '.jpg' and jpeg_count:
                print(f'{count}\t{extension[1:]} ({jpeg_count} as jpeg)')
            else:
                print(f'{count}\t{extension[1:]}')
        exit(0)
    # We got to the end: print out all the links, fully qualified.
    link_list = [f'{args.site}{link["href"]}' for link in links]
    for link in link_list:
        print(link)
    if args.save_report:
        with open(args.save_report, 'w') as file:
            # todo: create summary object that will be printed with --summary or saved to file with --save-summary
            print(link_list, file=file)
|  | |||||||
		Reference in New Issue
	
	Block a user
	 Lukáš Kucharczyk
					Lukáš Kucharczyk