Working on PDF import
* Search the PDF file for a DOI, with a manual fallback if none is found * Move the PDF file * Add its BibTeX entry to the general BibTeX file TODO : * Better renaming * Adding to the BibTeX file
This commit is contained in:
parent
93d1fefa26
commit
ea53e0720f
118
main.py
118
main.py
@ -7,13 +7,20 @@ Main app
|
|||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import requests
|
import requests
|
||||||
|
import subprocess
|
||||||
|
import re
|
||||||
|
try:
|
||||||
|
from cStringIO import StringIO
|
||||||
|
except:
|
||||||
|
from StringIO import StringIO
|
||||||
from bibtexparser.bparser import BibTexParser
|
from bibtexparser.bparser import BibTexParser
|
||||||
import params
|
import params
|
||||||
|
|
||||||
|
|
||||||
def bibtex_append(data):
|
def bibtexAppend(data):
|
||||||
"""
|
"""
|
||||||
Append data to the main bibtex file
|
Append data to the main bibtex file
|
||||||
|
data is a dict as the one from bibtexparser output
|
||||||
"""
|
"""
|
||||||
bibtex = ''
|
bibtex = ''
|
||||||
for field, value in data:
|
for field, value in data:
|
||||||
@ -22,43 +29,106 @@ def bibtex_append(data):
|
|||||||
# TODO : Write
|
# TODO : Write
|
||||||
|
|
||||||
|
|
||||||
def add_file(src, doi):
|
def replaceAll(text, dic):
    """
    Replace in text every occurrence of each key of dic by its value

    text is the string to process
    dic is a dict mapping each substring to search for to its replacement

    Replacements are applied one after the other, in the iteration order of
    dic, so a later replacement also sees the output of the earlier ones.

    Returns the resulting string.
    """
    # items() exists on both Python 2 and Python 3 dicts, iteritems() is
    # Python 2 only
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
|
||||||
new_name = folder+"/"+doi
|
|
||||||
|
|
||||||
try:
|
|
||||||
shutil.copy2(src, new_name)
|
|
||||||
except IOError:
|
|
||||||
sys.exit("Unable to move file to library dir " + folder)
|
|
||||||
|
|
||||||
data = {"file": new_name}
|
|
||||||
|
|
||||||
bibtex_append(data)
|
|
||||||
|
|
||||||
print("File " + src + " successfully imported.")
|
|
||||||
|
|
||||||
|
|
||||||
def doi2bib(doi):
|
def PDF2Doi(pdf):
    """
    Search the text of a PDF file for a DOI

    The PDF is converted to text with the external pdftotext program, and the
    text is then scanned with a generic DOI pattern and a few
    publisher-specific fallbacks.

    pdf is the path to the PDF file

    Returns the DOI as a string, or False if the text could not be extracted
    or if no DOI was found in it.
    """
    try:
        pdftotext = subprocess.Popen(["pdftotext", pdf, "-"],
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
        extractfull = pdftotext.communicate()[0]
    except OSError:
        # pdftotext is not installed or could not be started
        return False

    # Use the exit status to detect a failure. Comparing stderr to "" by
    # identity is unreliable (and never matches the bytes returned on
    # Python 3), and pdftotext may print warnings while still succeeding.
    if pdftotext.returncode != 0:
        return False

    return _textToDoi(extractfull)


def _textToDoi(text):
    """
    Return the DOI found in text (the output of pdftotext), or False
    """
    # Work on the native str type: bytes are only decoded on Python 3
    if not isinstance(text, str):
        text = text.decode('utf-8', 'replace')

    # Capital OE ligature, in the same str type as text
    ligature = u'\u0152'
    if not isinstance(ligature, str):
        ligature = ligature.encode('utf-8')

    lowered = text.lower()

    # The ligature is turned into a dash before lowercasing, so that only the
    # capital form is affected (presumably a badly extracted dash)
    extractDOI = re.search(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
                           text.replace(ligature, '-').lower())
    if not extractDOI:
        # PNAS fix
        extractDOI = re.search(r'(?<=doi).?10\.1073/pnas\.\d+',
                               lowered.replace('pnas', '/pnas'))
    if not extractDOI:
        # JCB fix
        extractDOI = re.search(r'10\.1083/jcb\.\d{9}', lowered)
    if not extractDOI:
        return False

    # Drop the separators caught between "doi" and the DOI itself, including
    # a newline or a tab matched by \s
    cleanDOI = re.sub(r'[:\s]', '', extractDOI.group(0))
    if cleanDOI.startswith('/'):
        cleanDOI = cleanDOI[1:]

    # FASEB J fix
    if cleanDOI.startswith('10.1096'):
        cleanDOI = cleanDOI[:20]

    # Second JCB fix
    if cleanDOI.startswith('10.1083'):
        cleanDOI = cleanDOI[:21]

    if len(cleanDOI) > 40:
        # Overlong match: cut it at the first letter found after a digit in
        # the part following the 8 characters long prefix. To find it, work
        # on a copy of the same length where a dot between two digits and a
        # dash count as digits, and any other dot counts as a letter.
        cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
        reps = {'.': 'A', '-': '0'}
        cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
        digitStart = False
        for i, char in enumerate(cleanDOItemp):
            if char.isdigit():
                digitStart = True
            elif char.isalpha() and digitStart:
                cleanDOI = cleanDOI[:8 + i]
                break
        # When no cut point is found, the DOI is left untouched instead of
        # losing its last character

    return cleanDOI
|
||||||
|
|
||||||
|
|
||||||
|
def doi2Bib(doi, timeout=30):
    """
    Return a bibTeX string of metadata for a given DOI.

    From : https://gist.github.com/jrsmith3/5513926

    doi is the DOI to look up
    timeout is the number of seconds to wait for the server before giving up

    Raises a requests.exceptions.RequestException if the request fails, times
    out or gets an HTTP error status (the body of an error page is not bibTeX).
    """
    url = "http://dx.doi.org/" + doi
    # Ask the resolver for a bibTeX entry through content negotiation
    headers = {"accept": "application/x-bibtex"}
    # Without a timeout, requests may wait forever for an unresponsive server
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.text
|
||||||
|
|
||||||
|
|
||||||
|
def addFile(src):
    """
    Add a file to the library

    src is the path to the PDF file to import

    The DOI is searched in the file (and asked to the user if it is not
    found), the matching bibTeX entry is fetched, the file is copied to the
    library folder and the entry is appended to the main bibtex file.

    Exits the program with an error message if the bibTeX entry cannot be
    fetched or if the file cannot be copied.
    """
    doi = PDF2Doi(src)

    if doi is False:
        print("Could not determine the DOI for "+src+", switching to manual " +
              "entry.")
        # raw_input only exists on Python 2
        try:
            ask = raw_input
        except NameError:
            ask = input
        doi = ask('DOI ? ').strip()
    else:
        print("DOI for "+src+" is "+doi+".")

    try:
        bibtex = doi2Bib(doi).strip().replace(',', ",\n")
    except requests.exceptions.RequestException:
        sys.exit("Unable to fetch the bibtex entry for DOI " + doi + ".")
    bibtex = StringIO(bibtex)
    bibtex = BibTexParser(bibtex).get_entry_dict()

    if not bibtex:
        # Nothing could be parsed from the answer, there is no entry to edit
        sys.exit("Unable to fetch the bibtex entry for DOI " + doi + ".")

    # TODO : Rename
    # A DOI always contains a "/", which would otherwise point to a
    # subdirectory that does not exist in the library folder
    new_name = params.folder+"/"+doi.replace("/", "_")

    # The parsed dict is expected to hold a single entry
    entry = next(iter(bibtex))
    bibtex[entry]['file'] = new_name

    try:
        shutil.copy2(src, new_name)
    except IOError:
        sys.exit("Unable to move file to library dir " + params.folder+".")

    bibtexAppend(bibtex)
    print("File " + src + " successfully imported.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
sys.exit("Usage : TODO")
|
sys.exit("Usage : TODO")
|
||||||
|
|
||||||
|
|
||||||
if sys.argv[1] == 'download':
|
if sys.argv[1] == 'download':
|
||||||
raise Exception('TODO')
|
raise Exception('TODO')
|
||||||
|
|
||||||
@ -66,9 +136,7 @@ if __name__ == '__main__':
|
|||||||
if len(sys.argv) < 3:
|
if len(sys.argv) < 3:
|
||||||
sys.exit("Usage : " + sys.argv[0] + " import FILE")
|
sys.exit("Usage : " + sys.argv[0] + " import FILE")
|
||||||
|
|
||||||
doi = raw_input('DOI ? ')
|
addFile(sys.argv[2])
|
||||||
# TODO : Get DOI automagically
|
|
||||||
add_file(sys.argv[2], doi)
|
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
elif sys.argv[1] == 'list':
|
elif sys.argv[1] == 'list':
|
||||||
|
Loading…
Reference in New Issue
Block a user