Working on PDF import

* Search the PDF file for a DOI, with manual fallback if none is found
* Move the PDF file
* Add its BibTeX entry to the general BibTeX file

TODO:
* Better renaming
* Adding to the BibTeX file
2014-04-24 16:18:56 +02:00 · 2014-04-24 16:18:56 +02:00 · ea53e0720f
commit ea53e0720f
parent 93d1fefa26
1 changed files with 92 additions and 24 deletions
--- a/main.py
+++ b/main.py
@ -7,13 +7,20 @@ Main app
 import sys
 import shutil
 import requests
 import subprocess
 import re
 try:
    from cStringIO import StringIO
 except:
    from StringIO import StringIO
 from bibtexparser.bparser import BibTexParser
 import params
-def bibtex_append(data):
+def bibtexAppend(data):
    """
    Append data to the main bibtex file
    data is a dict as the one from bibtexparser output
    """
    bibtex = ''
    for field, value in data:
@ -22,43 +29,106 @@ def bibtex_append(data):
    # TODO : Write
-def add_file(src, doi):
+def replaceAll(text, dic):
-    """
+    for i, j in dic.iteritems():
-    Add a file to the library
+        text = text.replace(i, j)
-    """
+    return text
    new_name = folder+"/"+doi
    try:
        shutil.copy2(src, new_name)
    except IOError:
        sys.exit("Unable to move file to library dir " + folder)
    data = {"file": new_name}
-    bibtex_append(data)
+def PDF2Doi(pdf):
    pdftotext = subprocess.Popen(["pdftotext", pdf, "-"],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
    extractfull = pdftotext.communicate()
    if extractfull[1] is not "":
        return False
-    print("File " + src + " successfully imported.")
+    extractfull = extractfull[0]
    extractDOI = re.search('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]',
                           extractfull.lower().replace('&#338;', '-'))
    if not extractDOI:
        # PNAS fix
        extractDOI = re.search('(?<=doi).?10.1073/pnas\.\d+',
                               extractfull.lower().replace('pnas', '/pnas'))
        if not extractDOI:
            # JCB fix
            extractDOI = re.search('10\.1083/jcb\.\d{9}', extractfull.lower())
- 
+    cleanDOI = False
-def doi2bib(doi):
+    if extractDOI:
        cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
        if re.search('^/', cleanDOI):
            cleanDOI = cleanDOI[1:]
        # FASEB J fix
        if re.search('^10.1096', cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if re.search('^10.1083', cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            cleanDOItemp = re.sub(r'\d\.\d', '000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                    if cleanDOItemp[i].isalpha() and digitStart:
                        break
            cleanDOI = cleanDOI[0:(8+i)]
    return cleanDOI
 def doi2Bib(doi):
    """
    Return a bibTeX string of metadata for a given DOI.
    From : https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
-    r = requests.get(url, headers = headers)
+    r = requests.get(url, headers=headers)
    return r.text
 def addFile(src):
    """
    Add a file to the library

    Determine the DOI of the PDF at `src` with PDF2Doi (asking for it on
    stdin when it cannot be found), fetch the matching BibTeX record with
    doi2Bib, copy the file into params.folder and pass the parsed entry,
    extended with a 'file' field pointing at the copy, to bibtexAppend.

    Exits the program through sys.exit when no BibTeX entry can be parsed
    for the DOI or when the file cannot be copied into the library folder.
    """
    doi = PDF2Doi(src)
    if doi is False:
        print("Could not determine the DOI for "+src+", switching to manual " +
              "entry.")
        # Stray whitespace around a typed or pasted DOI would end up in the
        # lookup URL and in the file name.
        doi = raw_input('DOI ? ').strip()
    else:
        print("DOI for "+src+" is "+doi+".")

    bibtex = doi2Bib(doi).strip().replace(',', ",\n")
    bibtex = StringIO(bibtex)
    bibtex = BibTexParser(bibtex).get_entry_dict()
    if not bibtex:
        # Without this guard the key lookup below fails with an IndexError
        # whenever the answer for this DOI holds no parsable entry.
        sys.exit("Unable to get a BibTeX entry for DOI " + doi + ".")

    # TODO : Rename
    # A DOI has the form prefix/suffix; used verbatim the "/" would point
    # into a sub-directory of the library folder that does not exist, so it
    # is replaced to keep the copy directly inside params.folder.
    new_name = params.folder+"/"+doi.replace('/', '_')
    try:
        shutil.copy2(src, new_name)
    except IOError:
        sys.exit("Unable to move file to library dir " + params.folder+".")

    # Record the location of the copy in the (first) parsed entry.
    bibtex[list(bibtex.keys())[0]]['file'] = new_name
    bibtexAppend(bibtex)
    print("File " + src + " successfully imported.")
 if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("Usage : TODO")
    if sys.argv[1] == 'download':
        raise Exception('TODO')
@ -66,9 +136,7 @@ if __name__ == '__main__':
        if len(sys.argv) < 3:
            sys.exit("Usage : " + sys.argv[0] + " import FILE")
-        doi = raw_input('DOI ? ')
+        addFile(sys.argv[2])
        # TODO : Get DOI automagically
        add_file(sys.argv[2], doi)
        sys.exit()
    elif sys.argv[1] == 'list':