Working on PDF import

* Search the PDF file for DOI, manual fallback if not found
* Move the PDF file
* Add its BibTeX entry to the general BibTeX file

TODO:
* Better renaming
* Adding to bibtex file
This commit is contained in:
Phyks 2014-04-24 16:18:56 +02:00
parent 93d1fefa26
commit ea53e0720f

120
main.py
View File

@ -7,13 +7,20 @@ Main app
import sys import sys
import shutil import shutil
import requests import requests
import subprocess
import re
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
from bibtexparser.bparser import BibTexParser from bibtexparser.bparser import BibTexParser
import params import params
def bibtex_append(data): def bibtexAppend(data):
""" """
Append data to the main bibtex file Append data to the main bibtex file
data is a dict as the one from bibtexparser output
""" """
bibtex = '' bibtex = ''
for field, value in data: for field, value in data:
@ -22,43 +29,106 @@ def bibtex_append(data):
# TODO : Write # TODO : Write
def replaceAll(text, dic):
    """
    Return text with every key of dic replaced by its associated value.

    text is the string to process.
    dic maps each substring to look for to its replacement string.

    Replacements are applied one after the other, in the iteration order of
    dic, so a later replacement also sees the output of the earlier ones.
    """
    # items() rather than the Python 2-only iteritems(), so that the helper
    # runs unchanged on both Python 2 and Python 3.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
def PDF2Doi(pdf):
    """
    Search the text of a PDF file for a DOI.

    pdf is the path of the PDF file; it is handed to the pdftotext command.

    Return the DOI as a string, or False if the text could not be extracted
    (pdftotext could not be started, or it wrote something on its stderr) or
    if no DOI-like pattern was found in the extracted text.
    """
    try:
        pdftotext = subprocess.Popen(["pdftotext", pdf, "-"],
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
    except OSError:
        # pdftotext could not be launched: report "no DOI" so that the
        # caller can fall back to something else instead of a traceback.
        return False
    extractfull, errors = pdftotext.communicate()
    # Test for emptiness, not identity with "": identity of strings is
    # implementation-dependent and never holds when bytes are returned.
    if errors:
        return False
    if not isinstance(extractfull, str):
        # Python 3 hands back bytes; the regexes below work on text.
        extractfull = extractfull.decode('utf-8', 'replace')
    return _doiFromText(extractfull)


def _doiFromText(text):
    """Return the first DOI found in text, cleaned up, or False if none."""
    text = text.lower()
    extractDOI = re.search(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
                           text.replace('&#338;', '-'))
    if not extractDOI:
        # PNAS fix
        extractDOI = re.search(r'(?<=doi).?10.1073/pnas\.\d+',
                               text.replace('pnas', '/pnas'))
    if not extractDOI:
        # JCB fix
        extractDOI = re.search(r'10\.1083/jcb\.\d{9}', text)
    if not extractDOI:
        return False

    # The match may start with the separator that followed "doi" (':', '/',
    # a space or a newline): remove it so that only the DOI itself is left.
    cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '').strip()
    if cleanDOI.startswith('/'):
        cleanDOI = cleanDOI[1:]

    # FASEB J fix
    if cleanDOI.startswith('10.1096'):
        cleanDOI = cleanDOI[:20]

    # Second JCB fix
    if cleanDOI.startswith('10.1083'):
        cleanDOI = cleanDOI[:21]

    if len(cleanDOI) > 40:
        cleanDOI = _trimLongDoi(cleanDOI)
    return cleanDOI


def _trimLongDoi(doi):
    """Cut a DOI at the first letter following a digit, past its prefix."""
    # Every substitution below keeps the length unchanged, so index i in
    # the masked string corresponds to index 8 + i in the DOI.
    # "digit.digit" runs and '-' count as digits, any other '.' as a letter.
    masked = re.sub(r'\d\.\d', '000', doi)
    masked = replaceAll(masked[8:], {'.': 'A', '-': '0'})
    digitStart = False
    for i, char in enumerate(masked):
        if char.isdigit():
            digitStart = True
        if char.isalpha() and digitStart:
            return doi[:8 + i]
    # No letter after the digits: there is nothing to cut, keep the DOI
    # whole rather than dropping its last character.
    return doi
def doi2Bib(doi):
    """
    Return a bibTeX string of metadata for a given DOI.

    From : https://gist.github.com/jrsmith3/5513926

    doi is the DOI to look up, as a string.

    Raise requests.exceptions.HTTPError if the server answers with an error
    status, instead of returning the error page as if it were bibTeX.
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    r = requests.get(url, headers=headers)
    # An error status means the body is not a bibTeX entry.
    r.raise_for_status()
    return r.text
def addFile(src):
    """
    Add a file to the library

    src is the path of the PDF file to import.

    The DOI is searched in the file with PDF2Doi, and asked to the user if
    it cannot be found. The matching bibtex entry is fetched with doi2Bib,
    the file is copied to params.folder and the entry, with the path of the
    copy in its 'file' field, is passed to bibtexAppend.

    Exit the program if no bibtex entry could be parsed for the DOI or if
    the file could not be copied.
    """
    doi = PDF2Doi(src)
    # "not doi" also covers an empty string, not only the False sentinel.
    if not doi:
        print("Could not determine the DOI for "+src+", switching to manual " +
              "entry.")
        doi = raw_input('DOI ? ')
    else:
        print("DOI for "+src+" is "+doi+".")

    # One field per line, so that the parser can split the entry.
    bibtex = doi2Bib(doi).strip().replace(',', ",\n")
    bibtex = StringIO(bibtex)
    bibtex = BibTexParser(bibtex).get_entry_dict()
    if not bibtex:
        # Nothing was parsed: stop here with an explicit message instead of
        # failing below while looking for the first entry.
        sys.exit("Unable to get a bibtex entry for DOI " + doi + ".")

    # TODO : Rename
    # A DOI always contains a "/" between its prefix and its suffix: used
    # as is, it would point into a subdirectory that does not exist and the
    # copy below would fail.
    new_name = params.folder+"/"+doi.replace("/", "_")
    # next(iter(...)) gives the first key on both Python 2 and Python 3.
    bibtex[next(iter(bibtex))]['file'] = new_name

    try:
        shutil.copy2(src, new_name)
    except IOError:
        sys.exit("Unable to move file to library dir " + params.folder+".")

    bibtexAppend(bibtex)
    print("File " + src + " successfully imported.")
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) < 2: if len(sys.argv) < 2:
sys.exit("Usage : TODO") sys.exit("Usage : TODO")
if sys.argv[1] == 'download': if sys.argv[1] == 'download':
raise Exception('TODO') raise Exception('TODO')
@ -66,9 +136,7 @@ if __name__ == '__main__':
if len(sys.argv) < 3: if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + " import FILE") sys.exit("Usage : " + sys.argv[0] + " import FILE")
doi = raw_input('DOI ? ') addFile(sys.argv[2])
# TODO : Get DOI automagically
add_file(sys.argv[2], doi)
sys.exit() sys.exit()
elif sys.argv[1] == 'list': elif sys.argv[1] == 'list':