auto-remove proxy.lib.pdx.edu from urls

This commit is contained in:
Bryan Bishop 2013-01-19 19:33:17 -06:00
parent 9c7da548e1
commit cf7c1b78e1

View File

@ -42,6 +42,8 @@ def download(phenny, input, verbose=True):
if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"): if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
return return
for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line): for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
line = filter_fix(line)
translation_url = "http://localhost:1969/web" translation_url = "http://localhost:1969/web"
headers = { headers = {
@ -234,3 +236,11 @@ def extract_meta_content(tree, meta_name):
else: else:
return content return content
def filter_fix(url):
    """
    Clean up a url before it is processed further.

    Currently the only cleanup is stripping the ".proxy.lib.pdx.edu"
    suffix that appears in proxied links, e.g.
    "http://example.com.proxy.lib.pdx.edu/x" -> "http://example.com/x".

    Every occurrence of the suffix is removed, wherever it appears in
    the string. A url that does not contain it is returned unchanged.
    """
    proxy_suffix = ".proxy.lib.pdx.edu"

    # Nothing to strip: hand the url back untouched.
    if proxy_suffix not in url:
        return url

    return url.replace(proxy_suffix, "")