Update formatting

This commit is contained in:
Lukáš Kucharczyk 2021-03-18 17:47:37 +01:00
parent 1ffe83e493
commit bf65cc41e7
No known key found for this signature in database
GPG Key ID: 65524498C0196B64
1 changed file with 23 additions and 18 deletions

41
main.py
View File

@@ -6,23 +6,27 @@ from collections import Counter
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Dig out links from a website.") parser = argparse.ArgumentParser(
parser.add_argument('site', type=str, help="Website that you want to scrape for links.") description="Dig out links from a website.")
parser.add_argument(
'site', type=str, help="Website that you want to scrape for links.")
parser.add_argument('-f', '--filter', type=str, default=r'..*$', parser.add_argument('-f', '--filter', type=str, default=r'..*$',
help="Only return file names matching this regular expression.") help="Only return file names matching this regular expression.")
parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.") parser.add_argument('-s', '--summary', action='store_true',
parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.") help="Print summary and quit.")
parser.add_argument('-d', '--debug', action='store_true',
help="Print out debugging information.")
parser.add_argument('-r', '--save-report', action='store_true', parser.add_argument('-r', '--save-report', action='store_true',
help='Save report of all the original links to a file.') help='Save report of all the original links to a file.')
args = parser.parse_args() args = parser.parse_args()
# todo: save in tmp # todo: save in tmp
h = httplib2.Http('.cache') h = httplib2.Http('.cache')
response, content = h.request(args.site) response, content = h.request(args.site)
s = BeautifulSoup(content, features='html.parser') s = BeautifulSoup(content, features='html.parser')
# if args.debug: # if args.debug:
# print(s.prettify()) # print(s.prettify())
@@ -40,7 +44,8 @@ if __name__ == '__main__':
has_jpeg = True has_jpeg = True
extensions.append(extension) extensions.append(extension)
if args.debug: if args.debug:
print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}') print(
f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
# todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg)) # todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg))
print(f'URL: {args.site}') print(f'URL: {args.site}')
print(f'Total file names: {len(links)}') print(f'Total file names: {len(links)}')
@@ -48,15 +53,15 @@ if __name__ == '__main__':
# todo: in the counts of extensions mention varying capitalisation # todo: in the counts of extensions mention varying capitalisation
extension_count = Counter(extensions) extension_count = Counter(extensions)
if has_jpeg: if has_jpeg:
for extension, count in extension_count.items(): for extension, count in extension_count.items():
# do not print the first character (dot) of extension # do not print the first character (dot) of extension
if not has_jpeg: if not has_jpeg:
print(f'{count}\t{extension[1:]}') print(f'{count}\t{extension[1:]}')
else: else:
if extension == '.jpg' or extension == '.jpeg': if extension == '.jpg' or extension == '.jpeg':
print(f"{count}\t{extension[1:]} (with some as .jpeg") print(f"{count}\t{extension[1:]} (with some as .jpeg")
exit(0) exit(0)
# we got to the end, print out all the links # we got to the end, print out all the links
link_list = [f'{args.site}{link["href"]}' for link in links] link_list = [f'{args.site}{link["href"]}' for link in links]
for link in link_list: for link in link_list: