Working on PDF import

* Search the PDF file for a DOI, with a manual fallback if none is found
* Move the PDF file
* Add its BibTeX entry to the general BibTeX file

TODO:
* Better renaming
* Adding to the BibTeX file
2014-04-24 16:18:56 +02:00 · 2014-04-24 16:18:56 +02:00 · ea53e0720f
commit ea53e0720f
parent 93d1fefa26
1 changed files with 92 additions and 24 deletions
--- a/main.py
+++ b/main.py
@ -7,13 +7,20 @@ Main app
 import sys
 import shutil
 import requests
+import subprocess
+import re
+try:
+    from cStringIO import StringIO
+except:
+    from StringIO import StringIO
 from bibtexparser.bparser import BibTexParser
 import params


-def bibtex_append(data):
+def bibtexAppend(data):
    """
    Append data to the main bibtex file
+    data is a dict as the one from bibtexparser output
    """
    bibtex = ''
    for field, value in data:
@ -22,43 +29,106 @@ def bibtex_append(data):
    # TODO : Write


-def add_file(src, doi):
-    """
-    Add a file to the library
-    """
-    new_name = folder+"/"+doi
+def replaceAll(text, dic):
+    """
+    Return text with every occurrence of each key of dic replaced by the
+    matching value
+
+    Replacements are applied one after the other, in the iteration order of
+    dic, so a replacement also applies to the output of the previous ones.
+    """
+    # items() is available on both Python 2 and Python 3 dicts, iteritems()
+    # is Python 2 only
+    for old, new in dic.items():
+        text = text.replace(old, new)
+    return text

-    try:
-        shutil.copy2(src, new_name)
-    except IOError:
-        sys.exit("Unable to move file to library dir " + folder)
-    
-    data = {"file": new_name}

-    bibtex_append(data)
+def PDF2Doi(pdf):
+    """
+    Search the text of a PDF file for a DOI
+
+    pdf is the path of the PDF file, it is converted to text by the
+    pdftotext command.
+    Return the DOI as a string, or False if pdftotext wrote anything on
+    stderr or if nothing looking like a DOI was found in the text.
+    """
+    pdftotext = subprocess.Popen(["pdftotext", pdf, "-"],
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+    extractfull, errors = pdftotext.communicate()
+    # Test the content of stderr : "is not" compares the identity of the
+    # string objects, not their value
+    if errors:
+        return False

-    print("File " + src + " successfully imported.")
+    extractfull = extractfull.lower()
+    extractDOI = re.search(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
+                           extractfull.replace('&#338;', '-'))
+    if not extractDOI:
+        # PNAS fix
+        extractDOI = re.search(r'(?<=doi).?10.1073/pnas\.\d+',
+                               extractfull.replace('pnas', '/pnas'))
+        if not extractDOI:
+            # JCB fix
+            extractDOI = re.search(r'10\.1083/jcb\.\d{9}', extractfull)

- 
-def doi2bib(doi):
+    if not extractDOI:
+        return False
+
+    cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
+    if cleanDOI.startswith('/'):
+        cleanDOI = cleanDOI[1:]
+
+    # FASEB J fix
+    if cleanDOI.startswith('10.1096'):
+        cleanDOI = cleanDOI[:20]
+
+    # Second JCB fix
+    if cleanDOI.startswith('10.1083'):
+        cleanDOI = cleanDOI[:21]
+
+    if len(cleanDOI) > 40:
+        # Too long to be a DOI alone : cut it at the first letter following
+        # a digit, after the 8 characters of the prefix.
+        # Dots between two digits are turned into digits and the other dots
+        # into letters, none of these replacements changes the length of
+        # the string so the indices still match those of cleanDOI.
+        cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
+        reps = {'.': 'A', '-': '0'}
+        cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
+        # Keep everything if there is no letter after the first digit
+        cut = len(cleanDOItemp)
+        digitStart = False
+        for i, char in enumerate(cleanDOItemp):
+            if char.isdigit():
+                digitStart = True
+            elif char.isalpha() and digitStart:
+                # Checked for every character and not only for digits, as a
+                # character is never both a digit and a letter
+                cut = i
+                break
+        cleanDOI = cleanDOI[:8 + cut]
+
+    return cleanDOI
+
+
+
+
+def doi2Bib(doi):
    """
    Return a bibTeX string of metadata for a given DOI.
    From : https://gist.github.com/jrsmith3/5513926
+
+    doi is the DOI string, appended as is to the http://dx.doi.org/ URL.
+    The body of the response is returned without checking the HTTP status.
    """
-    
    url = "http://dx.doi.org/" + doi
-    
+    # Ask for a bibTeX answer through the accept header
    headers = {"accept": "application/x-bibtex"}
-    r = requests.get(url, headers = headers)
-    
+    r = requests.get(url, headers=headers)
    return r.text


+def addFile(src):
+    """
+    Add a file to the library
+
+    src is the path of the PDF file to import. Its DOI is searched in the
+    file with PDF2Doi and asked to the user if it could not be found. The
+    bibtex entry fetched with doi2Bib gets a "file" field and is handed to
+    bibtexAppend, and the file is copied into params.folder.
+    Exit the program if no bibtex entry could be parsed for the DOI or if
+    the file could not be copied.
+    """
+    doi = PDF2Doi(src)
+
+    if doi is False:
+        print("Could not determine the DOI for "+src+", switching to manual " +
+              "entry.")
+        doi = raw_input('DOI ? ')
+    else:
+        print("DOI for "+src+" is "+doi+".")
+
+    bibtex = doi2Bib(doi).strip().replace(',', ",\n")
+    bibtex = StringIO(bibtex)
+    bibtex = BibTexParser(bibtex).get_entry_dict()
+    if not bibtex:
+        # doi2Bib returns the body of the response whatever it is, so there
+        # may be no entry at all to pick below
+        sys.exit("Unable to get a bibtex entry for DOI " + doi + ".")
+
+    # TODO : Rename
+    # A DOI contains a "/" between its prefix and its suffix, which would
+    # make the destination point to a subdirectory that does not exist
+    new_name = params.folder+"/"+doi.replace("/", "_")
+
+    bibtex[bibtex.keys()[0]]['file'] = new_name
+
+    try:
+        shutil.copy2(src, new_name)
+    except IOError:
+        sys.exit("Unable to move file to library dir " + params.folder+".")
+
+    bibtexAppend(bibtex)
+    print("File " + src + " successfully imported.")
+
+
 if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("Usage : TODO")

-
    if sys.argv[1] == 'download':
        raise Exception('TODO')

@ -66,9 +136,7 @@ if __name__ == '__main__':
        if len(sys.argv) < 3:
            sys.exit("Usage : " + sys.argv[0] + " import FILE")

-        doi = raw_input('DOI ? ')
-        # TODO : Get DOI automagically
-        add_file(sys.argv[2], doi)
+        addFile(sys.argv[2])
        sys.exit()

    elif sys.argv[1] == 'list':