Add summary function.

Lukáš Kucharczyk 2020-06-28 22:28:06 +02:00
parent b3a74584e9
commit 914498121c
1 changed file with 13 additions and 0 deletions

main.py | 13 +++++++++++++

@@ -1,14 +1,18 @@
 import httplib2
 import re
 import argparse
+import os.path
+from collections import Counter
 from bs4 import BeautifulSoup

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Dig out links from a website.")
     parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
     parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
+    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
     args = parser.parse_args()
+    """todo: save in tmp"""
     h = httplib2.Http('.cache')
     response, content = h.request(args.site)
@@ -18,5 +22,14 @@ if __name__ == '__main__':
     links = s.find_all(href=re.compile(args.filter))
     if not links or links == 0:
         exit("No filenames found with the given filter.")
+    if args.summary:
+        print(f'URL: {args.site}')
+        print(f'Total filenames: {len(links)}')
+        """create a list from extensions"""
+        exts = [os.path.splitext(ext['href'])[1] for ext in links]
+        print('File types:')
+        for ext, count in Counter(exts).items():
+            print(f'{count}\t{ext[1:]}')
+        exit(0)
     for link in links:
         print(args.site + link['href'])
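
For reference, a minimal standalone sketch of what the new --summary branch computes, with a hypothetical list of hrefs standing in for the BeautifulSoup results:

import os.path
from collections import Counter

# Hypothetical hrefs in place of links scraped from a real page.
hrefs = ['notes.txt', 'todo.txt', 'backup.zip']

# Same logic as the commit: collect each file's extension, then tally.
exts = [os.path.splitext(h)[1] for h in hrefs]
print('File types:')
for ext, count in Counter(exts).items():
    print(f'{count}\t{ext[1:]}')  # ext[1:] strips the leading dot

This would print "2  txt" and "1  zip". Note that an href with no extension yields an empty string from splitext, so it would be tallied under a blank file type.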