Compare commits
3 Commits
1ffe83e493
...
main
Author | SHA1 | Date | |
---|---|---|---|
7952dd37d4 | |||
8835c0f919 | |||
bf65cc41e7 |
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
.cache
|
19
main.py
19
main.py
@@ -6,12 +6,16 @@ from collections import Counter
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Dig out links from a website.")
|
||||
parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Dig out links from a website.")
|
||||
parser.add_argument(
|
||||
'site', type=str, help="Website that you want to scrape for links.")
|
||||
parser.add_argument('-f', '--filter', type=str, default=r'..*$',
|
||||
help="Only return file names matching this regular expression.")
|
||||
parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
|
||||
parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
|
||||
parser.add_argument('-s', '--summary', action='store_true',
|
||||
help="Print summary and quit.")
|
||||
parser.add_argument('-d', '--debug', action='store_true',
|
||||
help="Print out debugging information.")
|
||||
parser.add_argument('-r', '--save-report', action='store_true',
|
||||
help='Save report of all the original links to a file.')
|
||||
|
||||
@@ -23,8 +27,8 @@ if __name__ == '__main__':
|
||||
|
||||
s = BeautifulSoup(content, features='html.parser')
|
||||
|
||||
# if args.debug:
|
||||
# print(s.prettify())
|
||||
if args.debug:
|
||||
print(s.prettify())
|
||||
|
||||
links = s.find_all(href=re.compile(args.filter))
|
||||
if not links or links == 0:
|
||||
@@ -40,7 +44,8 @@ if __name__ == '__main__':
|
||||
has_jpeg = True
|
||||
extensions.append(extension)
|
||||
if args.debug:
|
||||
print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
|
||||
print(
|
||||
f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
|
||||
# todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg))
|
||||
print(f'URL: {args.site}')
|
||||
print(f'Total file names: {len(links)}')
|
||||
|
Reference in New Issue
Block a user