diff --git a/README.md b/README.md
index 66c9091..8e21769 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,9 @@ BiblioManager will always use standard formats such as BibTeX, so that you can e
## Current status
* Able to import a PDF / djvu file, automagically find the DOI / ISBN, get the bibtex entry back and add it to the library. If DOI / ISBN search fails, it will prompt you for it.
+* Able to download a paper from a given URL, trying each of the configured proxies in turn (you can list as many as you want), and store the pdf file with its metadata; see the example below.
+
+It should now be mostly working and usable, although it is still to be considered **experimental**.
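+
+For instance, the proxy list lives in `params.py` (a sketch only: the variable name `proxies` is what `fetcher.py` reads, but the addresses below are placeholders to adapt to your own setup):
+
+```python
+# params.py (excerpt): proxies are tried in order until one returns a pdf
+proxies = [
+    "socks5://127.0.0.1:9050",                         # e.g. a local Tor SOCKS proxy
+    "socks5://user:password@proxy.example.com:1080",   # or a remote SOCKS proxy
+]
+```
+
+A paper can then be fetched with `./main.py download URL [article|book]`.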
**Important note:** I use it personally, but I don't read articles from many journals. If you find a file that does not work, please file an issue or send me an e-mail with the relevant information. There are several alternative ways to get the metadata, and I did not really know which one was best when writing this code.
@@ -32,29 +35,16 @@ TODO -- To be updated
Install pdfminer, pdfparanoia (via pip) and requesocks.
-Init the submodules and install Zotero translation server.
Copy params.py.example as params.py and customize it.
Install pdftotext.
Install djvulibre to use djvu files.
Install isbntools with pip.
-## Paperbot
-
-Paperbot is a command line utility that fetches academic papers. When given a URL on stdin or as a CLI argument, it fetches the content and returns a public link on stdout. This seems to help enhance the quality of discussion and make us less ignorant.
-
-All content is scraped using [zotero/translators](https://github.com/zotero/translators). These are javascript scrapers that work on a large number of academic publisher sites and are actively maintained. Paperbot offloads links to [zotero/translation-server](https://github.com/zotero/translation-server), which runs the zotero scrapers headlessly in a gecko and xulrunner environment. The scrapers return metadata and a link to the pdf. Then paperbot fetches that particular pdf. When given a link straight to a pdf, which paperbot is also happy to compulsively archive it.
-
-I kept part of the code to handle pdf downloading, and added a backend behind it.
-
-Paperbot can try multiple instances of translation-server (configured to use different ways to access content) and different SOCKS proxies to retrieve the content.
-
-
## Used source codes
-* [zotero/translators](https://github.com/zotero/translators) : Links finder
-* [zotero/translation-server](https://github.com/zotero/translation-server) : Links finder
* [pdfparanoia](https://github.com/kanzure/pdfparanoia) : Watermark removal
+* [paperbot](https://github.com/kanzure/paperbot) : Paper fetching (although my fetching code is much more basic)
## License
@@ -71,6 +61,7 @@ TODO
A list of ideas and TODOs. Don't hesitate to give feedback on the ones you really want, or to propose your own.
+* Use pdfparanoia to remove watermarks from downloaded pdf files
* Webserver interface
* Various re.compile ?
* check output of subprocesses before it ends
diff --git a/fetcher.py b/fetcher.py
index b5ae66e..954aedb 100755
--- a/fetcher.py
+++ b/fetcher.py
@@ -1,462 +1,25 @@
#!/usr/bin/python2 -u
# coding=utf8
+
"""
Fetches papers.
"""
-import re
-import os
-import json
-import params
-import random
+
import requesocks as requests
-import lxml.etree
-import sys
-from time import time
-from StringIO import StringIO
+import params
-import pdfparanoia
+def download_url(url):
+    """
+    Try to download a pdf from the given url, going through each of the
+    proxies listed in params.proxies until one returns a pdf. Returns the
+    raw pdf content, or False if every proxy failed.
+    """
+    for proxy in params.proxies:
+        r_proxy = {
+            "http": proxy,
+            "https": proxy,
+        }
-def download_proxy(line, zotero, proxy, verbose=True):
- sys.stderr.write("attempting download of %s through %s and %s\n" %
- (line, zotero, proxy))
+        try:
+            r = requests.get(url, proxies=r_proxy)
+        except requests.exceptions.ConnectionError:
+            # This proxy is unreachable or refused the connection, try the next one.
+            continue
- headers = {
- "Content-Type": "application/json",
- }
-
- data = {
- "url": line,
- "sessionid": "what"
- }
-
- data = json.dumps(data)
-
- response = requests.post(zotero, data=data, headers=headers)
-
- if response.status_code != 200 or response.content == "[]":
- sys.stderr.write("no valid reply from zotero\n")
- sys.stderr.write("status %d\n" % response.status_code)
- sys.stderr.write("content %s\n" % response.content)
- return -1 # fatal
-
- sys.stderr.write("content %s\n" % response.content)
- # see if there are any attachments
- content = json.loads(response.content)
- item = content[0]
- title = item["title"]
-
- if not item.has_key("attachments"):
- sys.stderr.write("no attachement with this proxy\n")
- return 1 # try another proxy
-
- pdf_url = None
- for attachment in item["attachments"]:
- if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
- pdf_url = attachment["url"]
- break
-
- if not pdf_url:
- sys.stderr.write("no PDF attachement with this proxy\n")
- return 1 # try another proxy
-
- user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
-
- headers = {
- "User-Agent": user_agent,
- }
-
- sys.stderr.write("try retrieving " +
- str(pdf_url) + " through proxy " + proxy + "\n")
- response = None
- session = requests.Session()
- session.proxies = {
- 'http': proxy,
- 'https': proxy}
-
- try:
- if pdf_url.startswith("https://"):
- response = session.get(pdf_url, headers=headers, verify=False)
- else:
- response = session.get(pdf_url, headers=headers)
- except requests.exceptions.ConnectionError:
- sys.stderr.write("network failure on download " +
- str(pdf_url) + "\n")
- return 1
-
- # detect failure
- if response.status_code == 401:
- sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
- str(pdf_url) + "\n")
- return 1
- elif response.status_code != 200:
- sys.stderr.write("HTTP " + str(response.status_code)
- + " when trying to fetch " + str(pdf_url) + "\n")
- return 1
-
- data = response.content
-
- if "pdf" in response.headers["content-type"]:
- try:
- data = pdfparanoia.scrub(StringIO(data))
- except:
- # this is to avoid a PDFNotImplementedError
- pass
-
- # grr..
- title = title.encode("ascii", "ignore")
- title = title.replace(" ", "_")
- title = title[:params.maxlen]
-
- path = os.path.join(params.folder, title + ".pdf")
-
- file_handler = open(path, "w")
- file_handler.write(data)
- file_handler.close()
-
- filename = requests.utils.quote(title)
-
- # Remove an ending period, which sometimes happens when the
- # title of the paper has a period at the end.
- if filename[-1] == ".":
- filename = filename[:-1]
-
- url = params.url + filename + ".pdf"
-
- print(url)
- return 0
-
-
-def download(line, verbose=True):
- """
- Downloads a paper.
- """
-
- # don't bother if there's nothing there
- if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
- return
- for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
- line = filter_fix(line)
-
- # fix for login.jsp links to ieee xplore
- line = fix_ieee_login_urls(line)
- line = fix_jstor_pdf_urls(line)
-
- ok = False
-
- for (zotero, proxy) in params.servers:
- s = download_proxy(line, zotero, proxy, verbose)
- if s < 0:
- break
- if s == 0:
- ok = True
- break
- if not ok:
- for (zotero, proxy) in params.servers:
- s = download_url(line, proxy)
- sys.stderr.write("return code " + str(s) + "\n")
- if s == 0:
- ok = True
- break
- if not ok:
- s = download_url(line, params.servers[0][1], last_resort=True)
- if s != 0:
- print "couldn't get it at all :("
-
- return
-
-download.commands = ["fetch", "get", "download"]
-download.priority = "high"
-download.rule = r'(.*)'
-
-def download_ieee(url):
- """
- Downloads an IEEE paper. The Zotero translator requires frames/windows to
- be available. Eventually translation-server will be fixed, but until then
- it might be nice to have an IEEE workaround.
- """
- # url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
- # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
- raise NotImplementedError
-
-def download_url(url, proxy, last_resort=False):
- sys.stderr.write("attempting direct for %s through %s\n" % (url,
- proxy))
-
- session = requests.Session()
- session.proxies = {
- 'http': proxy,
- 'https': proxy}
-
- try:
- response = session.get(url, headers={"User-Agent": "origami-pdf"})
- except requests.exceptions.ConnectionError:
- sys.stderr.write("network failure on download " +
- str(url) + "\n")
- return 1
-
- content = response.content
-
- # just make up a default filename
- title = "%0.2x" % random.getrandbits(128)
-
- # default extension
- extension = ".txt"
-
- if "pdf" in response.headers["content-type"]:
- extension = ".pdf"
- elif check_if_html(response):
- # parse the html string with lxml.etree
- tree = parse_html(content)
-
- # extract some metadata with xpaths
- citation_pdf_url = find_citation_pdf_url(tree, url)
- citation_title = find_citation_title(tree)
-
- # aip.org sucks, citation_pdf_url is wrong
- if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
- citation_pdf_url = None
-
- if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
- content = session.get(citation_pdf_url).content
- tree = parse_html(content)
- # citation_title = ...
-
- # wow, this seriously needs to be cleaned up
- if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
- citation_title = citation_title.encode("ascii", "ignore")
- response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
- content = response.content
- if "pdf" in response.headers["content-type"]:
- extension = ".pdf"
- title = citation_title
- else:
- if "sciencedirect.com" in url and not "ShoppingCart" in url:
- try:
- title = tree.xpath("//h1[@class='svTitle']")[0].text
- pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
- new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
- new_content = new_response.content
- if "pdf" in new_response.headers["content-type"]:
- extension = ".pdf"
- except Exception:
- pass
- else:
- content = new_content
- response = new_response
- elif "jstor.org/" in url:
- # clean up the url
- if "?" in url:
- url = url[0:url.find("?")]
-
- # not all pages have the element
- try:
- title = tree.xpath("//div[@class='hd title']")[0].text
- except Exception:
- try:
- title = tree.xpath("//input[@name='ppv-title']/@value")[0]
- except Exception:
- pass
-
- # get the document id
- document_id = None
- if url[-1] != "/":
- #if "stable/" in url:
- #elif "discover/" in url:
- #elif "action/showShelf?candidate=" in url:
- #elif "pss/" in url:
- document_id = url.split("/")[-1]
-
- if document_id.isdigit():
- try:
- pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
- new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
- new_content = new_response.content
- if "pdf" in new_response.headers["content-type"]:
- extension = ".pdf"
- except Exception:
- pass
- else:
- content = new_content
- response = new_response
- elif ".aip.org/" in url:
- try:
- title = tree.xpath("//title/text()")[0].split(" | ")[0]
- pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
- new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
- new_content = new_response.content
- if "pdf" in new_response.headers["content-type"]:
- extension = ".pdf"
- except Exception:
- pass
- else:
- content = new_content
- response = new_response
- elif "ieeexplore.ieee.org" in url:
- try:
- pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
- new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
- new_content = new_response.content
- if "pdf" in new_response.headers["content-type"]:
- extension = ".pdf"
- except Exception:
- pass
- else:
- content = new_content
- response = new_response
- elif "h1 class=\"articleTitle" in content:
- try:
- title = tree.xpath("//h1[@class='articleTitle']")[0].text
- title = title.encode("ascii", "ignore")
- pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
- except:
- pass
- else:
- if pdf_url.startswith("/"):
- url_start = url[:url.find("/",8)]
- pdf_url = url_start + pdf_url
- response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
- content = response.content
- if "pdf" in response.headers["content-type"]:
- extension = ".pdf"
- # raise Exception("problem with citation_pdf_url or citation_title")
- # well, at least save the contents from the original url
- pass
-
- # make the title again just in case
- if not title:
- title = "%0.2x" % random.getrandbits(128)
-
- # can't create directories
- title = title.replace("/", "_")
- title = title.replace(" ", "_")
- title = title[:params.maxlen]
-
- path = os.path.join(params.folder, title + extension)
-
- if extension in [".pdf", "pdf"]:
- try:
- sys.stderr.write("got it! " +
- str(url) + "\n")
- content = pdfparanoia.scrub(StringIO(content))
- except:
- # this is to avoid a PDFNotImplementedError
- pass
-
- file_handler = open(path, "w")
- file_handler.write(content)
- file_handler.close()
-
- title = title.encode("ascii", "ignore")
- url = params.url + requests.utils.quote(title) + extension
-
- if extension in [".pdf", "pdf"]:
- print url
- return 0
- else:
- sys.stderr.write("couldn't find it, dump: %s\n" % url)
- if last_resort:
- print "couldn't find it, dump: %s" % url
- else:
- return 1
- return 0
-
-
-def parse_html(content):
- if not isinstance(content, StringIO):
- content = StringIO(content)
- parser = lxml.etree.HTMLParser()
- tree = lxml.etree.parse(content, parser)
- return tree
-
-def check_if_html(response):
- return "text/html" in response.headers["content-type"]
-
-def find_citation_pdf_url(tree, url):
- """
- Returns the content attribute.
- """
- citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
- if citation_pdf_url and not citation_pdf_url.startswith("http"):
- if citation_pdf_url.startswith("/"):
- url_start = url[:url.find("/",8)]
- citation_pdf_url = url_start + citation_pdf_url
- else:
- raise Exception("unhandled situation (citation_pdf_url)")
- return citation_pdf_url
-
-def find_citation_title(tree):
- """
- Returns the content attribute.
- """
- citation_title = extract_meta_content(tree, "citation_title")
- return citation_title
-
-def extract_meta_content(tree, meta_name):
- try:
- content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
- except:
- return None
- else:
- return content
-
-def filter_fix(url):
- """
- Fixes some common problems in urls.
- """
- if ".proxy.lib.pdx.edu" in url:
- url = url.replace(".proxy.lib.pdx.edu", "")
- return url
-
-def fix_ieee_login_urls(url):
- """
- Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the
- abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp
- page, and then this link is given to paperbot. The actual link is based on
- the arnumber.
-
- example:
- http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
- """
- if "ieeexplore.ieee.org/xpl/login.jsp" in url:
- if "arnumber=" in url:
- parts = url.split("arnumber=")
-
- # i guess the url might not look like the example in the docstring
- if "&" in parts[1]:
- arnumber = parts[1].split("&")[0]
- else:
- arnumber = parts[1]
-
- return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber
-
- # default case when things go wrong
- return url
-
-def fix_jstor_pdf_urls(url):
- """
- Fixes urls pointing to jstor pdfs.
- """
- if "jstor.org/" in url:
- if ".pdf" in url and not "?acceptTC=true" in url:
- url += "?acceptTC=true"
- return url
-
-if __name__ == '__main__':
- if len(sys.argv) > 1:
- for a in sys.argv[1:]:
- download(a)
- else:
- reqs = []
- while True:
- l = sys.stdin.readline()
- if not l:
- break
- reqs.append(time())
- if len(reqs) > params.thresh:
- delay = time() - reqs[len(reqs) - params.thresh + 1]
- if params.limit - delay > 0:
- print "rate limit exceeded, try again in %d second(s)" % (params.limit - delay)
- else:
- download(l)
+        # Keep the result only if the server answered with an actual pdf.
+        if r.status_code != 200 or 'pdf' not in r.headers['content-type']:
+            continue
+        return r.content
+    return False
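+
+# Minimal usage sketch (the url and output path below are only examples;
+# main.py does the equivalent in downloadFile):
+#
+#     pdf = download_url("http://example.com/paper.pdf")
+#     if pdf is not False:
+#         with open("/tmp/paper.pdf", "wb") as fh:
+#             fh.write(pdf)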
diff --git a/main.py b/main.py
index 06e567f..2b6c9fc 100755
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@
from __future__ import print_function
+import fetcher
import sys
import shutil
import requests
@@ -298,13 +299,14 @@ def addFile(src, filetype):
try:
shutil.copy2(src, new_name)
except IOError:
+        new_name = False
sys.exit("Unable to move file to library dir " + params.folder+".")
bibtexAppend(bibtex)
- print("File " + src + " successfully imported.")
+    return new_name
-def delete_id(ident):
+def deleteId(ident):
"""
Delete a file based on its id in the bibtex file
"""
@@ -325,7 +327,7 @@ def delete_id(ident):
return True
-def delete_file(filename):
+def deleteFile(filename):
"""
Delete a file based on its filename
"""
@@ -348,13 +350,41 @@ def delete_file(filename):
return found
+def downloadFile(url, filetype):
+    """
+    Download a paper from url, add it to the library and return its new
+    file name, or False if the paper could not be fetched.
+    """
+    pdf = fetcher.download_url(url)
+
+    if pdf is not False:
+        tmp = params.folder + 'tmp.pdf'
+        with open(tmp, 'wb') as fh:
+            fh.write(pdf)
+        new_name = addFile(tmp, filetype)
+        try:
+            os.remove(tmp)
+        except OSError:
+            warning('Unable to delete temp file ' + tmp)
+        return new_name
+    else:
+        warning("Could not fetch " + url)
+        return False
+
+
if __name__ == '__main__':
try:
if len(sys.argv) < 2:
sys.exit("Usage : TODO")
if sys.argv[1] == 'download':
- raise Exception('TODO')
+            if len(sys.argv) < 3:
+                sys.exit("Usage : " + sys.argv[0] +
+                         " download URL [article|book]")
+
+            filetype = None
+            if len(sys.argv) > 3 and sys.argv[3] in ["article", "book"]:
+                filetype = sys.argv[3].lower()
+
+            new_name = downloadFile(sys.argv[2], filetype)
+            if new_name is not False:
+                print(sys.argv[2] + " successfully imported as " + new_name + ".")
+            sys.exit()
if sys.argv[1] == 'import':
if len(sys.argv) < 3:
@@ -365,15 +395,17 @@ if __name__ == '__main__':
if len(sys.argv) > 3 and sys.argv[3] in ["article", "book"]:
filetype = sys.argv[3].lower()
- addFile(sys.argv[2], filetype)
+            new_name = addFile(sys.argv[2], filetype)
+            if new_name is not False:
+                print("File " + sys.argv[2] + " successfully imported as " + new_name + ".")
sys.exit()
elif sys.argv[1] == 'delete':
if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID")
- if not delete_id(sys.argv[2]):
- if not delete_file(sys.argv[2]):
+            if not deleteId(sys.argv[2]):
+                if not deleteFile(sys.argv[2]):
warning("Unable to delete "+sys.argv[2])
sys.exit(1)
diff --git a/translation-server b/translation-server
deleted file mode 160000
index 4d35648..0000000
--- a/translation-server
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4d35648672c1ff2d2b6c61308ac7fcb684d63448