shovel/main.py
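
"""Dig out links from a website.

Fetch a page, find anchors whose href matches a regular expression,
then either list the matching links, print a summary of file types
(-s/--summary), or save a report of the links to a file (-r/--save-report).
"""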

import httplib2
import re
import argparse
import os.path
from collections import Counter
from bs4 import BeautifulSoup

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Dig out links from a website.")
    parser.add_argument(
        'site', type=str, help="Website that you want to scrape for links.")
    parser.add_argument('-f', '--filter', type=str, default=r'..*$',
                        help="Only return file names matching this regular expression.")
    parser.add_argument('-s', '--summary', action='store_true',
                        help="Print summary and quit.")
    parser.add_argument('-d', '--debug', action='store_true',
                        help="Print out debugging information.")
    # takes a file name, since the report is written with open(args.save_report)
    parser.add_argument('-r', '--save-report', type=str,
                        help='Save report of all the original links to the given file.')

    args = parser.parse_args()
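
    # Illustrative invocations of the CLI defined above (the URL and the
    # filter pattern here are hypothetical examples, not from the project):
    #   python main.py https://example.com/files/
    #   python main.py https://example.com/files/ -f '\.pdf$' --summary
    #   python main.py https://example.com/files/ -r report.txt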

    # todo: save in tmp
    h = httplib2.Http('.cache')  # httplib2 caches responses in this directory
    response, content = h.request(args.site)

    s = BeautifulSoup(content, features='html.parser')

    if args.debug:
        print(s.prettify())

    links = s.find_all(href=re.compile(args.filter))
    if not links:
        exit("No file names found with the given filter.")

    if args.summary:
        # grab only the extension from splitext
        extensions = []
        has_jpeg = False
        for link in links:
            filename, extension = os.path.splitext(link['href'])
            if extension:
                if extension == '.jpeg':
                    has_jpeg = True
                extensions.append(extension)
            if args.debug:
                print(
                    f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')

        # todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg))
        print(f'URL: {args.site}')
        print(f'Total file names: {len(links)}')
        print('File types:')
        # todo: in the counts of extensions mention varying capitalisation
        extension_count = Counter(extensions)
        for extension, count in extension_count.items():
            # do not print the first character (dot) of the extension
            if has_jpeg and extension == '.jpg':
                print(f"{count}\t{extension[1:]} (with some as .jpeg)")
            else:
                print(f'{count}\t{extension[1:]}')
        exit(0)

    # we got to the end, print out all the links
    # (naive concatenation: assumes each href is relative to args.site)
    link_list = [f'{args.site}{link["href"]}' for link in links]
    for link in link_list:
        print(f"{link}")
    if args.save_report:
        with open(args.save_report, 'w') as file:
            # todo: create summary object that will be printed with --summary or saved to file with --save-summary
            print(link_list, file=file)