shovel/main.py

23 lines
822 B
Python
Raw Normal View History

2020-06-28 19:11:24 +00:00
import httplib2
import re
2020-06-28 19:20:52 +00:00
import argparse
2020-06-28 19:11:24 +00:00
from bs4 import BeautifulSoup
def resolve_link(site, href):
    """Return *href* as an absolute URL, resolved against *site*.

    Uses standard URL-joining rules instead of plain string concatenation,
    so an already-absolute *href* is returned unchanged and a relative
    *href* is resolved against the page URL.

    Args:
        site: URL of the page the link was found on.
        href: Value of the link's ``href`` attribute.

    Returns:
        The absolute URL as a string.
    """
    # Imported locally so this block stays self-contained.
    from urllib.parse import urljoin

    return urljoin(site, href)


def main():
    """Fetch a web page and print every link whose href matches a filter.

    Command-line arguments:
        site: URL of the page to scrape.
        -f / --filter: regular expression the ``href`` must match
            (searched, not anchored). The default matches any href
            containing a dot.

    Raises:
        SystemExit: with a message when no link matches the filter, or
            (via ``parser.error``) when the filter is not a valid regex.
    """
    parser = argparse.ArgumentParser(description="Dig out links from a website.")
    parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
    parser.add_argument(
        '-f', '--filter', type=str, default=r'\..*$',
        help="Only return filenames matching this regular expression.",
    )
    args = parser.parse_args()

    # Validate the user-supplied pattern up front so a typo gives a usage
    # error instead of a traceback after the page has been downloaded.
    try:
        pattern = re.compile(args.filter)
    except re.error as err:
        parser.error("invalid --filter regular expression: {}".format(err))

    # '.cache' is passed to httplib2 as its cache location.
    h = httplib2.Http('.cache')
    _response, content = h.request(args.site)
    soup = BeautifulSoup(content, features='html.parser')

    # Find only elements whose href matches the filename filter.
    links = soup.find_all(href=pattern)
    if not links:
        raise SystemExit("No filenames found with the given filter.")

    for link in links:
        print(resolve_link(args.site, link['href']))


if __name__ == '__main__':
    main()