Add summary function.
This commit is contained in:
parent
b3a74584e9
commit
914498121c
13
main.py
13
main.py
|
@ -1,14 +1,18 @@
|
||||||
import httplib2
|
import httplib2
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
|
import os.path
|
||||||
|
from collections import Counter
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Dig out links from a website.")
|
parser = argparse.ArgumentParser(description="Dig out links from a website.")
|
||||||
parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
|
parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
|
||||||
parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
|
parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
|
||||||
|
parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
"""todo: save in tmp"""
|
||||||
h = httplib2.Http('.cache')
|
h = httplib2.Http('.cache')
|
||||||
response, content = h.request(args.site)
|
response, content = h.request(args.site)
|
||||||
|
|
||||||
|
@ -18,5 +22,14 @@ if __name__ == '__main__':
|
||||||
links = s.find_all(href=re.compile(args.filter))
|
links = s.find_all(href=re.compile(args.filter))
|
||||||
if not links or links == 0:
|
if not links or links == 0:
|
||||||
exit("No filenames found with the given filter.")
|
exit("No filenames found with the given filter.")
|
||||||
|
if args.summary:
|
||||||
|
print(f'URL: {args.site}')
|
||||||
|
print(f'Total filenames: {len(links)}')
|
||||||
|
"""create a list from extensions"""
|
||||||
|
exts = [os.path.splitext(ext['href'])[1] for ext in links]
|
||||||
|
print('File types:')
|
||||||
|
for ext, count in Counter(exts).items():
|
||||||
|
print(f'{count}\t{ext[1:]}')
|
||||||
|
exit(0)
|
||||||
for link in links:
|
for link in links:
|
||||||
print(args.site + link['href'])
|
print(args.site + link['href'])
|
||||||
|
|
Loading…
Reference in New Issue