def filter_fix(url, proxy_suffixes=(".proxy.lib.pdx.edu",)):
    """
    Fixes some common problems in urls.

    Currently this removes library-proxy suffixes from a url, so that a
    proxied address such as "http://www.nature.com.proxy.lib.pdx.edu/x"
    becomes "http://www.nature.com/x".

    url -- the url string to clean up.
    proxy_suffixes -- iterable of suffix strings to strip out of the url.
        Defaults to the single ".proxy.lib.pdx.edu" suffix, so existing
        one-argument callers keep working.

    Returns the cleaned url. A url that contains none of the suffixes is
    returned unchanged.
    """
    # Imported locally so the helper does not depend on what the
    # surrounding module happens to import.
    import re

    for suffix in proxy_suffixes:
        # Host names are case-insensitive, so the suffix is matched
        # regardless of case. re.escape keeps the dots literal instead of
        # letting them match any character.
        url = re.compile(re.escape(suffix), re.IGNORECASE).sub("", url)
    return url