diff --git a/bmc.py b/bmc.py index 1ee3ce8..91a1de5 100755 --- a/bmc.py +++ b/bmc.py @@ -102,9 +102,11 @@ def addFile(src, filetype, manual, autoconfirm, tag): if not manual: try: if filetype == 'article' or filetype is None: - doi = fetcher.findDOI(src) - if doi is False and (filetype == 'article' or filetype is None): - arxiv = fetcher.findArXivId(src) + id_type, article_id = fetcher.findID(src) + if id_type == "DOI": + doi = article_id + elif id_type == "arXiv": + arxiv = article_id if filetype == 'book' or (doi is False and arxiv is False and filetype is None): @@ -338,7 +340,7 @@ def resync(): break else: if 'doi' in list(entry.keys()): - doi = fetcher.findDOI(filename) + doi = fetcher.findID(filename, only=["DOI"]) if doi is not False and doi != entry['doi']: loop = tools.rawInput("Found DOI does not " + "match bibtex entry " + @@ -346,7 +348,7 @@ def resync(): "? [y/N]") loop = (loop.lower() != 'y') if 'Eprint' in list(entry.keys()): - arxiv = fetcher.findArXivId(filename) + arxiv = fetcher.findID(filename, only=["arXiv"]) if arxiv is not False and arxiv != entry['Eprint']: loop = tools.rawInput("Found arXiv id does " + "not match bibtex " + diff --git a/libbmc/fetcher.py b/libbmc/fetcher.py index 863fc4a..f120bd6 100644 --- a/libbmc/fetcher.py +++ b/libbmc/fetcher.py @@ -180,13 +180,16 @@ clean_doi_re = re.compile('^/') clean_doi_fabse_re = re.compile('^10.1096') clean_doi_jcb_re = re.compile('^10.1083') clean_doi_len_re = re.compile(r'\d\.\d') +arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE) -def findDOI(src): - """Search for a valid DOI in src. +def findArticleID(src, only=["DOI", "arXiv"]): + """Search for a valid article ID (DOI or ArXiv) in src. - Returns the DOI or False if not found or an error occurred. + Returns a tuple (type, first matching ID) or False if not found + or an error occurred. From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/ + and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb """ if src.endswith(".pdf"): totext = subprocess.Popen(["pdftotext", src, "-"], @@ -197,33 +200,49 @@ def findDOI(src): stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: - return False + return (False, False) extractfull = '' + extract_type = False + extractID = None while totext.poll() is None: extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()]) - extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) - if not extractDOI: - # PNAS fix - extractDOI = doi_pnas_re.search(extractfull. - lower(). - replace('pnas', '/pnas')) - if not extractDOI: - # JSB fix - extractDOI = doi_jsb_re.search(extractfull.lower()) - if extractDOI: - totext.terminate() + # Try to extract DOI + if "DOI" in only: + extractID = doi_re.search(extractfull.lower().replace('Œ', '-')) + if not extractID: + # PNAS fix + extractID = doi_pnas_re.search(extractfull. + lower(). + replace('pnas', '/pnas')) + if not extractID: + # JSB fix + extractID = doi_jsb_re.search(extractfull.lower()) + if extractID: + extract_type = "DOI" + totext.terminate() + # Try to extract arXiv + if "arXiv" in only: + tmp_extractID = arXiv_re.search(extractfull) + if tmp_extractID: + if not extractID or extractID.start(0) > tmp_extractID.start(1): + # Only use arXiv id if it is before the DOI in the pdf + extractID = tmp_extractID + extract_type = "arXiv" + totext.terminate() + if extract_type is not False: break err = totext.communicate()[1] if totext.returncode > 0: # Error happened tools.warning(err) - return False + return (False, False) - cleanDOI = False - if extractDOI: - cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '') + if extractID is not None and extract_type == "DOI": + # If DOI extracted, clean it and return it + cleanDOI = False + cleanDOI = extractID.group(0).replace(':', '').replace(' ', '') if clean_doi_re.search(cleanDOI): cleanDOI = cleanDOI[1:] # FABSE J fix @@ -243,7 +262,11 @@ def findDOI(src): if cleanDOItemp[i].isalpha() and digitStart: break cleanDOI = cleanDOI[0:(8+i)] - return cleanDOI + return ("DOI", cleanDOI) + elif extractID is not None and extract_type == "arXiv": + # If arXiv id is extracted, return it + return ("arXiv", extractID.group(1)) + return (False, False) def doi2Bib(doi): @@ -276,45 +299,6 @@ def doi2Bib(doi): return '' -arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE) - - -def findArXivId(src): - """Searches for a valid arXiv id in src. - - Returns the arXiv id or False if not found or an error occurred. - From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb - """ - if src.endswith(".pdf"): - totext = subprocess.Popen(["pdftotext", src, "-"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - elif src.endswith(".djvu"): - totext = subprocess.Popen(["djvutxt", src], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - else: - return False - - extractfull = '' - while totext.poll() is None: - extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()]) - extractID = arXiv_re.search(extractfull) - if extractID: - totext.terminate() - break - - err = totext.communicate()[1] - if totext.returncode > 0: - # Error happened - tools.warning(err) - return False - elif extractID is not None: - return extractID.group(1) - else: - return False - - def arXiv2Bib(arxiv): """Returns bibTeX string of metadata for a given arXiv id diff --git a/libbmc/tests/test_fetcher.py b/libbmc/tests/test_fetcher.py index 23f30f2..3fbf9fb 100644 --- a/libbmc/tests/test_fetcher.py +++ b/libbmc/tests/test_fetcher.py @@ -50,16 +50,22 @@ class TestFetcher(unittest.TestCase): self.assertEqual(isbn2Bib('foo'), '') def test_findDOI_PDF(self): - self.assertEqual(findDOI("libbmc/tests/src/test.pdf"), - "10.1103/physrevlett.112.253201") + self.assertEqual(findArticleID("libbmc/tests/src/test.pdf"), + ("DOI", "10.1103/physrevlett.112.253201")) - def test_findDOI_DJVU(self): + def test_findOnlyDOI(self): + self.assertEqual(findArticleID("libbmc/tests/src/test.pdf", + only=["DOI"]), + ("DOI", "10.1103/physrevlett.112.253201")) + + def test_findDOID_DJVU(self): # DOI is incomplete in this test because my djvu file is bad - self.assertEqual(findDOI("libbmc/tests/src/test.djvu"), - "10.1103/physrevlett.112") + self.assertEqual(findArticleID("libbmc/tests/src/test.djvu"), + ("DOI", "10.1103/physrevlett.112")) def test_findDOI_False(self): - self.assertFalse(findDOI("libbmc/tests/src/test_arxiv_multi.pdf")) + self.assertFalse(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf", + only=["DOI"])[0]) def test_doi2Bib(self): self.assertEqual(doi2Bib('10.1103/physreva.88.043630'), self.doi_bib) @@ -68,8 +74,18 @@ class TestFetcher(unittest.TestCase): self.assertEqual(doi2Bib('blabla'), '') def test_findArXivId(self): - self.assertEqual(findArXivId("libbmc/tests/src/test_arxiv_multi.pdf"), - '1303.3130v1') + self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf"), + ("arXiv", '1303.3130v1')) + + def test_findOnlyArXivId(self): + self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf", + only=["arXiv"]), + ("arXiv", '1303.3130v1')) + + def test_findArticleID(self): + # cf https://github.com/Phyks/BMC/issues/19 + self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_doi_conflict.pdf"), + ("arXiv", '1107.4487v1')) def test_arXiv2Bib(self): self.assertEqual(arXiv2Bib('1303.3130v1'), self.arxiv_bib)