From ea53e0720f175919d68c1bc497ad5130300fcb47 Mon Sep 17 00:00:00 2001 From: Phyks Date: Thu, 24 Apr 2014 16:18:56 +0200 Subject: [PATCH] Working on PDF import * Search the PDF file for DOI, manual fallback if not found * Move the PDF file * Add its Bibtex entry to the general bibtex file TODO : * Better renaming * Adding to bibtex file --- main.py | 116 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 24 deletions(-) diff --git a/main.py b/main.py index 18761c2..a425838 100755 --- a/main.py +++ b/main.py @@ -7,13 +7,20 @@ Main app import sys import shutil import requests +import subprocess +import re +try: + from cStringIO import StringIO +except: + from StringIO import StringIO from bibtexparser.bparser import BibTexParser import params -def bibtex_append(data): +def bibtexAppend(data): """ Append data to the main bibtex file + data is a dict as the one from bibtexparser output """ bibtex = '' for field, value in data: @@ -22,43 +29,106 @@ def bibtex_append(data): # TODO : Write -def add_file(src, doi): - """ - Add a file to the library - """ - new_name = folder+"/"+doi +def replaceAll(text, dic): + for i, j in dic.iteritems(): + text = text.replace(i, j) + return text - try: - shutil.copy2(src, new_name) - except IOError: - sys.exit("Unable to move file to library dir " + folder) - - data = {"file": new_name} - bibtex_append(data) +def PDF2Doi(pdf): + pdftotext = subprocess.Popen(["pdftotext", pdf, "-"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + extractfull = pdftotext.communicate() + if extractfull[1] is not "": + return False - print("File " + src + " successfully imported.") + extractfull = extractfull[0] + extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', + extractfull.lower().replace('Œ', '-')) + if not extractDOI: + # PNAS fix + extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+', + extractfull.lower().replace('pnas', '/pnas')) + if not extractDOI: + # JSB fix + extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower()) - -def doi2bib(doi): + cleanDOI = False + if extractDOI: + cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '') + if re.search('^/', cleanDOI): + cleanDOI = cleanDOI[1:] + + # FABSE J fix + if re.search('^10.1096', cleanDOI): + cleanDOI = cleanDOI[:20] + + # Second JCB fix + if re.search('^10.1083', cleanDOI): + cleanDOI = cleanDOI[:21] + + if len(cleanDOI) > 40: + cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI) + reps = {'.': 'A', '-': '0'} + cleanDOItemp = replaceAll(cleanDOItemp[8:], reps) + digitStart = 0 + for i in range(len(cleanDOItemp)): + if cleanDOItemp[i].isdigit(): + digitStart = 1 + if cleanDOItemp[i].isalpha() and digitStart: + break + cleanDOI = cleanDOI[0:(8+i)] + + return cleanDOI + + +def doi2Bib(doi): """ Return a bibTeX string of metadata for a given DOI. From : https://gist.github.com/jrsmith3/5513926 """ - url = "http://dx.doi.org/" + doi - headers = {"accept": "application/x-bibtex"} - r = requests.get(url, headers = headers) - + r = requests.get(url, headers=headers) return r.text +def addFile(src): + """ + Add a file to the library + """ + doi = PDF2Doi(src) + + if doi is False: + print("Could not determine the DOI for "+src+", switching to manual " + + "entry.") + doi = raw_input('DOI ? ') + else: + print("DOI for "+src+" is "+doi+".") + + bibtex = doi2Bib(doi).strip().replace(',', ",\n") + bibtex = StringIO(bibtex) + bibtex = BibTexParser(bibtex).get_entry_dict() + + # TODO : Rename + new_name = params.folder+"/"+doi + + bibtex[bibtex.keys()[0]]['file'] = new_name + + try: + shutil.copy2(src, new_name) + except IOError: + sys.exit("Unable to move file to library dir " + params.folder+".") + + bibtexAppend(bibtex) + print("File " + src + " successfully imported.") + + if __name__ == '__main__': if len(sys.argv) < 2: sys.exit("Usage : TODO") - if sys.argv[1] == 'download': raise Exception('TODO') @@ -66,9 +136,7 @@ if __name__ == '__main__': if len(sys.argv) < 3: sys.exit("Usage : " + sys.argv[0] + " import FILE") - doi = raw_input('DOI ? ') - # TODO : Get DOI automagically - add_file(sys.argv[2], doi) + addFile(sys.argv[2]) sys.exit() elif sys.argv[1] == 'list':