From 02e679bc72804423d63fbede2c032aa1280628b0 Mon Sep 17 00:00:00 2001
From: Phyks
Date: Sat, 26 Apr 2014 11:52:19 +0200
Subject: [PATCH] Download of papers working

You should pass the URL of the PDF file to the script, along with the
`download` parameter. It will try the proxies listed in the `params.py`
file until it finds one that allows it to fetch the PDF file.

TODO : Use pdfparanoia to remove watermarks
---
 README.md          |  19 +-
 fetcher.py         | 465 ++------------------------------------------
 main.py            |  46 ++++-
 translation-server |   1 -
 4 files changed, 58 insertions(+), 473 deletions(-)
 delete mode 160000 translation-server

diff --git a/README.md b/README.md
index 66c9091..8e21769 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,9 @@ BiblioManager will always use standard formats such as BibTeX, so that you can e
 ## Current status
 
 * Able to import a PDF / djvu file, automagically find the DOI / ISBN, get the bibtex entry back and add it to the library. If DOI / ISBN search fails, it will prompt you for it.
+* Able to download a paper from a URL, using any of the specified proxies (you can list several and it will try them all), and store the PDF file with its metadata.
+
+It should now be mostly working and usable, although it is still to be considered **experimental**.
 
 **Important note :** I use it for personnal use, but I don't read articles from many journals. If you find any file which is not working, please fill an issue or send me an e-mail with the relevant information. There are alternative ways to get the metadata for example, and I didn't know really which one was the best one as writing this code.
 
@@ -32,29 +35,16 @@ TODO -- To be updated
 
 Install pdfminer, pdfparanoia (via pip) and requesocks.
 
-Init the submodules and install Zotero translation server.
 
 Copy params.py.example as params.py and customize it.
 
 Install pdftotext. Install djvulibre to use djvu files.
 
 Install isbntools with pip.
 
-## Paperbot
-
-Paperbot is a command line utility that fetches academic papers. When given a URL on stdin or as a CLI argument, it fetches the content and returns a public link on stdout. This seems to help enhance the quality of discussion and make us less ignorant.
-
-All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. When given a link straight to a pdf, which paperbot is also happy to compulsively archive it.
-
-I kept part of the code to handle pdf downloading, and added a backend behind it.
-
-Paperbot can try multiple instances of translation-server (configured to use different ways to access content) and different SOCKS proxies to retrieve the content.
-
-
 ## Used source codes
 
-* [zotero/translators](https://github.com/zotero/translators) : Links finder
-* [zotero/translation-server](https://github.com/zotero/translation-server) : Links finder
 * [pdfparanoia](https://github.com/kanzure/pdfparanoia) : Watermark removal
+* [paperbot](https://github.com/kanzure/paperbot), although my paper fetching is much more basic
 
 ## License
 
@@ -71,6 +61,7 @@ TODO
 
 A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your owns.
 
+* pdfparanoia to remove the watermarks on pdf files
 * Webserver interface
 * Various re.compile ?
 * check output of subprocesses before it ends
diff --git a/fetcher.py b/fetcher.py
index b5ae66e..954aedb 100755
--- a/fetcher.py
+++ b/fetcher.py
@@ -1,462 +1,25 @@
 #!/usr/bin/python2 -u
 # coding=utf8
+
 """
 Fetches papers.
 """
-import re
-import os
-import json
-import params
-import random
+
 import requesocks as requests
-import lxml.etree
-import sys
-from time import time
-from StringIO import StringIO
+import params
 
-import pdfparanoia
 
+def download_url(url):
+    for proxy in params.proxies:
+        r_proxy = {
+            "http": proxy,
+            "https": proxy,
+        }
 
-def download_proxy(line, zotero, proxy, verbose=True):
-    sys.stderr.write("attempting download of %s through %s and %s\n" %
-                     (line, zotero, proxy))
+        r = requests.get(url, proxies=r_proxy)
 
-    headers = {
-        "Content-Type": "application/json",
-    }
-
-    data = {
-        "url": line,
-        "sessionid": "what"
-    }
-
-    data = json.dumps(data)
-
-    response = requests.post(zotero, data=data, headers=headers)
-
-    if response.status_code != 200 or response.content == "[]":
-        sys.stderr.write("no valid reply from zotero\n")
-        sys.stderr.write("status %d\n" % response.status_code)
-        sys.stderr.write("content %s\n" % response.content)
-        return -1  # fatal
-
-    sys.stderr.write("content %s\n" % response.content)
-    # see if there are any attachments
-    content = json.loads(response.content)
-    item = content[0]
-    title = item["title"]
-
-    if not item.has_key("attachments"):
-        sys.stderr.write("no attachement with this proxy\n")
-        return 1  # try another proxy
-
-    pdf_url = None
-    for attachment in item["attachments"]:
-        if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
-            pdf_url = attachment["url"]
-            break
-
-    if not pdf_url:
-        sys.stderr.write("no PDF attachement with this proxy\n")
-        return 1  # try another proxy
-
-    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
-
-    headers = {
-        "User-Agent": user_agent,
-    }
-
-    sys.stderr.write("try retrieving " +
-                     str(pdf_url) + " through proxy " + proxy + "\n")
-    response = None
-    session = requests.Session()
-    session.proxies = {
-        'http': proxy,
-        'https': proxy}
-
-    try:
-        if pdf_url.startswith("https://"):
-            response = session.get(pdf_url, headers=headers, verify=False)
-        else:
-            response = session.get(pdf_url, headers=headers)
-    except requests.exceptions.ConnectionError:
-        sys.stderr.write("network failure on download " +
-                         str(pdf_url) + "\n")
-        return 1
-
-    # detect failure
-    if response.status_code == 401:
-        sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
-                         str(pdf_url) + "\n")
-        return 1
-    elif response.status_code != 200:
-        sys.stderr.write("HTTP " + str(response.status_code)
-                         + " when trying to fetch " + str(pdf_url) + "\n")
-        return 1
-
-    data = response.content
-
-    if "pdf" in response.headers["content-type"]:
-        try:
-            data = pdfparanoia.scrub(StringIO(data))
-        except:
-            # this is to avoid a PDFNotImplementedError
-            pass
-
-    # grr..
-    title = title.encode("ascii", "ignore")
-    title = title.replace(" ", "_")
-    title = title[:params.maxlen]
-
-    path = os.path.join(params.folder, title + ".pdf")
-
-    file_handler = open(path, "w")
-    file_handler.write(data)
-    file_handler.close()
-
-    filename = requests.utils.quote(title)
-
-    # Remove an ending period, which sometimes happens when the
-    # title of the paper has a period at the end.
-    if filename[-1] == ".":
-        filename = filename[:-1]
-
-    url = params.url + filename + ".pdf"
-
-    print(url)
-    return 0
-
-
-def download(line, verbose=True):
-    """
-    Downloads a paper.
-    """
-
-    # don't bother if there's nothing there
-    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
-        return
-    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
-        line = filter_fix(line)
-
-        # fix for login.jsp links to ieee xplore
-        line = fix_ieee_login_urls(line)
-        line = fix_jstor_pdf_urls(line)
-
-        ok = False
-
-        for (zotero, proxy) in params.servers:
-            s = download_proxy(line, zotero, proxy, verbose)
-            if s < 0:
-                break
-            if s == 0:
-                ok = True
-                break
-        if not ok:
-            for (zotero, proxy) in params.servers:
-                s = download_url(line, proxy)
-                sys.stderr.write("return code " + str(s) + "\n")
-                if s == 0:
-                    ok = True
-                    break
-        if not ok:
-            s = download_url(line, params.servers[0][1], last_resort=True)
-            if s != 0:
-                print "couldn't get it at all :("
-
-    return
-
-download.commands = ["fetch", "get", "download"]
-download.priority = "high"
-download.rule = r'(.*)'
-
-def download_ieee(url):
-    """
-    Downloads an IEEE paper. The Zotero translator requires frames/windows to
-    be available. Eventually translation-server will be fixed, but until then
-    it might be nice to have an IEEE workaround.
-    """
-    # url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
-    # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
-    raise NotImplementedError
-
-def download_url(url, proxy, last_resort=False):
-    sys.stderr.write("attempting direct for %s through %s\n" % (url,
-                     proxy))
-
-    session = requests.Session()
-    session.proxies = {
-        'http': proxy,
-        'https': proxy}
-
-    try:
-        response = session.get(url, headers={"User-Agent": "origami-pdf"})
-    except requests.exceptions.ConnectionError:
-        sys.stderr.write("network failure on download " +
-                         str(url) + "\n")
-        return 1
-
-    content = response.content
-
-    # just make up a default filename
-    title = "%0.2x" % random.getrandbits(128)
-
-    # default extension
-    extension = ".txt"
-
-    if "pdf" in response.headers["content-type"]:
-        extension = ".pdf"
-    elif check_if_html(response):
-        # parse the html string with lxml.etree
-        tree = parse_html(content)
-
-        # extract some metadata with xpaths
-        citation_pdf_url = find_citation_pdf_url(tree, url)
-        citation_title = find_citation_title(tree)
-
-        # aip.org sucks, citation_pdf_url is wrong
-        if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
-            citation_pdf_url = None
-
-        if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
-            content = session.get(citation_pdf_url).content
-            tree = parse_html(content)
-            # citation_title = ...
-
-        # wow, this seriously needs to be cleaned up
-        if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
-            citation_title = citation_title.encode("ascii", "ignore")
-            response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
-            content = response.content
-            if "pdf" in response.headers["content-type"]:
-                extension = ".pdf"
-                title = citation_title
-        else:
-            if "sciencedirect.com" in url and not "ShoppingCart" in url:
-                try:
-                    title = tree.xpath("//h1[@class='svTitle']")[0].text
-                    pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
-                    new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
-                    new_content = new_response.content
-                    if "pdf" in new_response.headers["content-type"]:
-                        extension = ".pdf"
-                except Exception:
-                    pass
-                else:
-                    content = new_content
-                    response = new_response
-            elif "jstor.org/" in url:
-                # clean up the url
-                if "?" in url:
-                    url = url[0:url.find("?")]
-
-                # not all pages have the element
-                try:
-                    title = tree.xpath("//div[@class='hd title']")[0].text
-                except Exception:
-                    try:
-                        title = tree.xpath("//input[@name='ppv-title']/@value")[0]
-                    except Exception:
-                        pass
-
-                # get the document id
-                document_id = None
-                if url[-1] != "/":
-                    #if "stable/" in url:
-                    #elif "discover/" in url:
-                    #elif "action/showShelf?candidate=" in url:
-                    #elif "pss/" in url:
-                    document_id = url.split("/")[-1]
-
-                    if document_id.isdigit():
-                        try:
-                            pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
-                            new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
-                            new_content = new_response.content
-                            if "pdf" in new_response.headers["content-type"]:
-                                extension = ".pdf"
-                        except Exception:
-                            pass
-                        else:
-                            content = new_content
-                            response = new_response
-            elif ".aip.org/" in url:
-                try:
-                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
-                    pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
-                    new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
-                    new_content = new_response.content
-                    if "pdf" in new_response.headers["content-type"]:
-                        extension = ".pdf"
-                except Exception:
-                    pass
-                else:
-                    content = new_content
-                    response = new_response
-            elif "ieeexplore.ieee.org" in url:
-                try:
-                    pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
-                    new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
-                    new_content = new_response.content
-                    if "pdf" in new_response.headers["content-type"]:
-                        extension = ".pdf"
-                except Exception:
-                    pass
-                else:
-                    content = new_content
-                    response = new_response
-            elif "h1 class=\"articleTitle" in content:
-                try:
-                    title = tree.xpath("//h1[@class='articleTitle']")[0].text
-                    title = title.encode("ascii", "ignore")
-                    pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
-                except:
-                    pass
-                else:
-                    if pdf_url.startswith("/"):
-                        url_start = url[:url.find("/",8)]
-                        pdf_url = url_start + pdf_url
-                    response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
-                    content = response.content
-                    if "pdf" in response.headers["content-type"]:
-                        extension = ".pdf"
-            # raise Exception("problem with citation_pdf_url or citation_title")
-            # well, at least save the contents from the original url
-            pass
-
-    # make the title again just in case
-    if not title:
-        title = "%0.2x" % random.getrandbits(128)
-
-    # can't create directories
-    title = title.replace("/", "_")
-    title = title.replace(" ", "_")
-    title = title[:params.maxlen]
-
-    path = os.path.join(params.folder, title + extension)
-
-    if extension in [".pdf", "pdf"]:
-        try:
-            sys.stderr.write("got it! " +
-                             str(url) + "\n")
-            content = pdfparanoia.scrub(StringIO(content))
-        except:
-            # this is to avoid a PDFNotImplementedError
-            pass
-
-    file_handler = open(path, "w")
-    file_handler.write(content)
-    file_handler.close()
-
-    title = title.encode("ascii", "ignore")
-    url = params.url + requests.utils.quote(title) + extension
-
-    if extension in [".pdf", "pdf"]:
-        print url
-        return 0
-    else:
-        sys.stderr.write("couldn't find it, dump: %s\n" % url)
-        if last_resort:
-            print "couldn't find it, dump: %s" % url
-        else:
-            return 1
-    return 0
-
-
-def parse_html(content):
-    if not isinstance(content, StringIO):
-        content = StringIO(content)
-    parser = lxml.etree.HTMLParser()
-    tree = lxml.etree.parse(content, parser)
-    return tree
-
-def check_if_html(response):
-    return "text/html" in response.headers["content-type"]
-
-def find_citation_pdf_url(tree, url):
-    """
-    Returns the content attribute.
-    """
-    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
-    if citation_pdf_url and not citation_pdf_url.startswith("http"):
-        if citation_pdf_url.startswith("/"):
-            url_start = url[:url.find("/",8)]
-            citation_pdf_url = url_start + citation_pdf_url
-        else:
-            raise Exception("unhandled situation (citation_pdf_url)")
-    return citation_pdf_url
-
-def find_citation_title(tree):
-    """
-    Returns the content attribute.
-    """
-    citation_title = extract_meta_content(tree, "citation_title")
-    return citation_title
-
-def extract_meta_content(tree, meta_name):
-    try:
-        content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
-    except:
-        return None
-    else:
-        return content
-
-def filter_fix(url):
-    """
-    Fixes some common problems in urls.
-    """
-    if ".proxy.lib.pdx.edu" in url:
-        url = url.replace(".proxy.lib.pdx.edu", "")
-    return url
-
-def fix_ieee_login_urls(url):
-    """
-    Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the
-    abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp
-    page, and then this link is given to paperbot. The actual link is based on
-    the arnumber.
-
-    example:
-    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
-    """
-    if "ieeexplore.ieee.org/xpl/login.jsp" in url:
-        if "arnumber=" in url:
-            parts = url.split("arnumber=")
-
-            # i guess the url might not look like the example in the docstring
-            if "&" in parts[1]:
-                arnumber = parts[1].split("&")[0]
-            else:
-                arnumber = parts[1]
-
-            return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber
-
-    # default case when things go wrong
-    return url
-
-def fix_jstor_pdf_urls(url):
-    """
-    Fixes urls pointing to jstor pdfs.
- """ - if "jstor.org/" in url: - if ".pdf" in url and not "?acceptTC=true" in url: - url += "?acceptTC=true" - return url - -if __name__ == '__main__': - if len(sys.argv) > 1: - for a in sys.argv[1:]: - download(a) - else: - reqs = [] - while True: - l = sys.stdin.readline() - if not l: - break - reqs.append(time()) - if len(reqs) > params.thresh: - delay = time() - reqs[len(reqs) - params.thresh + 1] - if params.limit - delay > 0: - print "rate limit exceeded, try again in %d second(s)" % (params.limit - delay) - else: - download(l) + if r.status_code != 200 or 'pdf' not in r.headers['content-type']: + continue + return r.content + return False diff --git a/main.py b/main.py index 06e567f..2b6c9fc 100755 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ from __future__ import print_function +import fetcher import sys import shutil import requests @@ -298,13 +299,14 @@ def addFile(src, filetype): try: shutil.copy2(src, new_name) except IOError: + new_name = False sys.exit("Unable to move file to library dir " + params.folder+".") bibtexAppend(bibtex) - print("File " + src + " successfully imported.") + return new_name -def delete_id(ident): +def deleteId(ident): """ Delete a file based on its id in the bibtex file """ @@ -325,7 +327,7 @@ def delete_id(ident): return True -def delete_file(filename): +def deleteFile(filename): """ Delete a file based on its filename """ @@ -348,13 +350,41 @@ def delete_file(filename): return found +def downloadFile(url, filetype): + pdf = fetcher.download_url(url) + + if pdf is not False: + with open(params.folder+'tmp.pdf', 'w+') as fh: + fh.write(pdf) + new_name = addFile(params.folder+'tmp.pdf', filetype) + try: + os.remove(params.folder+'tmp.pdf') + except: + warning('Unable to delete temp file '+params.folder+'tmp.pdf') + return new_name + else: + warning("Could not fetch "+url) + return False + + if __name__ == '__main__': try: if len(sys.argv) < 2: sys.exit("Usage : TODO") if sys.argv[1] == 'download': - raise Exception('TODO') + if len(sys.argv) < 3: + sys.exit("Usage : " + sys.argv[0] + + " download FILE [article|book]") + + filetype = None + if len(sys.argv) > 3 and sys.argv[3] in ["article", "book"]: + filetype = sys.argv[3].lower() + + new_name = downloadFile(sys.argv[2], filetype) + if new_name is not False: + print(sys.argv[2]+" successfully imported as "+new_name) + sys.exit() if sys.argv[1] == 'import': if len(sys.argv) < 3: @@ -365,15 +395,17 @@ if __name__ == '__main__': if len(sys.argv) > 3 and sys.argv[3] in ["article", "book"]: filetype = sys.argv[3].lower() - addFile(sys.argv[2], filetype) + new_name = addFile(sys.argv[2], filetype) + if new_name is not False: + print("File " + src + " successfully imported as "+new_name+".") sys.exit() elif sys.argv[1] == 'delete': if len(sys.argv) < 3: sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID") - if not delete_id(sys.argv[2]): - if not delete_file(sys.argv[2]): + if not deleteId(sys.argv[2]): + if not deleteFile(sys.argv[2]): warning("Unable to delete "+sys.argv[2]) sys.exit(1) diff --git a/translation-server b/translation-server deleted file mode 160000 index 4d35648..0000000 --- a/translation-server +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4d35648672c1ff2d2b6c61308ac7fcb684d63448