shovel/main.py

import httplib2
import re
import argparse
import os.path
from collections import Counter
from bs4 import BeautifulSoup

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Dig out links from a website.")
    parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
    parser.add_argument('-f', '--filter', type=str, default='\..*$', help="Only return filenames matching this regular expression.")
    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
    args = parser.parse_args()

    """todo: save in tmp"""
    h = httplib2.Http('.cache')
    response, content = h.request(args.site)

    s = BeautifulSoup(content,features='html.parser')

    """find only file names"""
    links = s.find_all(href=re.compile(args.filter))
    if not links or links == 0:
        exit("No filenames found with the given filter.")
    if args.summary:
        print(f'URL: {args.site}')
        print(f'Total filenames: {len(links)}')
        """create a list from extensions"""
        exts = [os.path.splitext(ext['href'])[1] for ext in links]
        print('File types:')
        for ext, count in Counter(exts).items():
            print(f'{count}\t{ext[1:]}')
        exit(0)
    for link in links:
        print(args.site + link['href'])