Save unfinished work.
This commit is contained in:
parent
914498121c
commit
1ffe83e493
60
main.py
60
main.py
|
@ -8,28 +8,60 @@ from bs4 import BeautifulSoup
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Dig out links from a website.")
|
parser = argparse.ArgumentParser(description="Dig out links from a website.")
|
||||||
parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
|
parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
|
||||||
parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
|
# The dot in the default must stay escaped: r'\..*$' keeps only hrefs that
# contain a literal '.', whereas r'..*$' matches any href with at least one
# character and so filters nothing.
parser.add_argument('-f', '--filter', type=str, default=r'\..*$',
                    help="Only return file names matching this regular expression.")
|
||||||
parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
|
parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
|
||||||
|
parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
|
||||||
|
parser.add_argument('-r', '--save-report', action='store_true',
|
||||||
|
help='Save report of all the original links to a file.')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
"""todo: save in tmp"""
|
# todo: save in tmp
|
||||||
h = httplib2.Http('.cache')
|
h = httplib2.Http('.cache')
|
||||||
response, content = h.request(args.site)
|
response, content = h.request(args.site)
|
||||||
|
|
||||||
s = BeautifulSoup(content,features='html.parser')
|
s = BeautifulSoup(content, features='html.parser')
|
||||||
|
|
||||||
|
# if args.debug:
|
||||||
|
# print(s.prettify())
|
||||||
|
|
||||||
"""find only file names"""
|
|
||||||
links = s.find_all(href=re.compile(args.filter))
|
links = s.find_all(href=re.compile(args.filter))
|
||||||
if not links or links == 0:
|
if not links or links == 0:
|
||||||
exit("No filenames found with the given filter.")
|
exit("No file names found with the given filter.")
|
||||||
if args.summary:
|
if args.summary:
|
||||||
print(f'URL: {args.site}')
|
# grab only the extension from splitext
|
||||||
print(f'Total filenames: {len(links)}')
|
extensions = []
|
||||||
"""create a list from extensions"""
|
has_jpeg = False
|
||||||
exts = [os.path.splitext(ext['href'])[1] for ext in links]
|
|
||||||
print('File types:')
|
|
||||||
for ext, count in Counter(exts).items():
|
|
||||||
print(f'{count}\t{ext[1:]}')
|
|
||||||
exit(0)
|
|
||||||
for link in links:
|
for link in links:
|
||||||
print(args.site + link['href'])
|
filename, extension = os.path.splitext(link['href'])
|
||||||
|
if extension:
|
||||||
|
if extension == '.jpeg':
|
||||||
|
has_jpeg = True
|
||||||
|
extensions.append(extension)
|
||||||
|
if args.debug:
|
||||||
|
print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
|
||||||
|
# Summary report: print the URL, the number of matched links and one row per
# file extension. '.jpeg' is folded into '.jpg' so both spellings share a
# single row, displayed as "<total>\tjpg (<n> as jpeg)"; every other
# extension is always printed as "<count>\t<extension>".
print(f'URL: {args.site}')
print(f'Total file names: {len(links)}')
print('File types:')
# todo: in the counts of extensions mention varying capitalisation
extension_count = Counter(extensions)
# pop() with a default never raises, so this is safe when no '.jpeg' was seen
jpeg_count = extension_count.pop('.jpeg', 0)
if jpeg_count:
    # a Counter yields 0 for missing keys, so this also works without any '.jpg'
    extension_count['.jpg'] += jpeg_count
for extension, count in extension_count.items():
    # do not print the first character (dot) of extension
    name = extension[1:]
    if extension == '.jpg' and jpeg_count:
        print(f'{count}\t{name} ({jpeg_count} as jpeg)')
    else:
        print(f'{count}\t{name}')
exit(0)
|
||||||
|
# we got to the end, print out all the links
link_list = [f'{args.site}{link["href"]}' for link in links]
for link in link_list:
    print(link)
if args.save_report:
    # --save-report is declared with action='store_true', so args.save_report
    # is the bool True here; open(True, 'w') would open file descriptor 1
    # (stdout) instead of creating a file. Treat the value as a path only when
    # it is a string, otherwise fall back to a fixed report name.
    if isinstance(args.save_report, str):
        report_path = args.save_report
    else:
        report_path = 'link_report.txt'
    with open(report_path, 'w', encoding='utf-8') as file:
        # todo: create summary object that will be printed with --summary or saved to file with --save-summary
        # one link per line, the same layout as the console output above
        file.write('\n'.join(link_list) + '\n')
|
||||||
|
|
Loading…
Reference in New Issue