Solve issue #19

This commit is contained in:
Phyks 2014-11-04 21:11:19 +01:00
parent 9f821a409c
commit a39c1d94d0
3 changed files with 74 additions and 72 deletions

12
bmc.py
View File

@ -102,9 +102,11 @@ def addFile(src, filetype, manual, autoconfirm, tag):
if not manual: if not manual:
try: try:
if filetype == 'article' or filetype is None: if filetype == 'article' or filetype is None:
doi = fetcher.findDOI(src) id_type, article_id = fetcher.findID(src)
if doi is False and (filetype == 'article' or filetype is None): if id_type == "DOI":
arxiv = fetcher.findArXivId(src) doi = article_id
elif id_type == "arXiv":
arxiv = article_id
if filetype == 'book' or (doi is False and arxiv is False and if filetype == 'book' or (doi is False and arxiv is False and
filetype is None): filetype is None):
@ -338,7 +340,7 @@ def resync():
break break
else: else:
if 'doi' in list(entry.keys()): if 'doi' in list(entry.keys()):
doi = fetcher.findDOI(filename) doi = fetcher.findID(filename, only=["DOI"])
if doi is not False and doi != entry['doi']: if doi is not False and doi != entry['doi']:
loop = tools.rawInput("Found DOI does not " + loop = tools.rawInput("Found DOI does not " +
"match bibtex entry " + "match bibtex entry " +
@ -346,7 +348,7 @@ def resync():
"? [y/N]") "? [y/N]")
loop = (loop.lower() != 'y') loop = (loop.lower() != 'y')
if 'Eprint' in list(entry.keys()): if 'Eprint' in list(entry.keys()):
arxiv = fetcher.findArXivId(filename) arxiv = fetcher.findID(filename, only=["arXiv"])
if arxiv is not False and arxiv != entry['Eprint']: if arxiv is not False and arxiv != entry['Eprint']:
loop = tools.rawInput("Found arXiv id does " + loop = tools.rawInput("Found arXiv id does " +
"not match bibtex " + "not match bibtex " +

View File

@ -180,13 +180,16 @@ clean_doi_re = re.compile('^/')
clean_doi_fabse_re = re.compile('^10.1096') clean_doi_fabse_re = re.compile('^10.1096')
clean_doi_jcb_re = re.compile('^10.1083') clean_doi_jcb_re = re.compile('^10.1083')
clean_doi_len_re = re.compile(r'\d\.\d') clean_doi_len_re = re.compile(r'\d\.\d')
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
def findDOI(src): def findArticleID(src, only=["DOI", "arXiv"]):
"""Search for a valid DOI in src. """Search for a valid article ID (DOI or ArXiv) in src.
Returns the DOI or False if not found or an error occurred. Returns a tuple (type, first matching ID) or False if not found
or an error occurred.
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/ From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
""" """
if src.endswith(".pdf"): if src.endswith(".pdf"):
totext = subprocess.Popen(["pdftotext", src, "-"], totext = subprocess.Popen(["pdftotext", src, "-"],
@ -197,33 +200,49 @@ def findDOI(src):
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE) stderr=subprocess.PIPE)
else: else:
return False return (False, False)
extractfull = '' extractfull = ''
extract_type = False
extractID = None
while totext.poll() is None: while totext.poll() is None:
extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()]) extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])
extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) # Try to extract DOI
if not extractDOI: if "DOI" in only:
# PNAS fix extractID = doi_re.search(extractfull.lower().replace('Œ', '-'))
extractDOI = doi_pnas_re.search(extractfull. if not extractID:
lower(). # PNAS fix
replace('pnas', '/pnas')) extractID = doi_pnas_re.search(extractfull.
if not extractDOI: lower().
# JSB fix replace('pnas', '/pnas'))
extractDOI = doi_jsb_re.search(extractfull.lower()) if not extractID:
if extractDOI: # JSB fix
totext.terminate() extractID = doi_jsb_re.search(extractfull.lower())
if extractID:
extract_type = "DOI"
totext.terminate()
# Try to extract arXiv
if "arXiv" in only:
tmp_extractID = arXiv_re.search(extractfull)
if tmp_extractID:
if not extractID or extractID.start(0) > tmp_extractID.start(1):
# Only use arXiv id if it is before the DOI in the pdf
extractID = tmp_extractID
extract_type = "arXiv"
totext.terminate()
if extract_type is not False:
break break
err = totext.communicate()[1] err = totext.communicate()[1]
if totext.returncode > 0: if totext.returncode > 0:
# Error happened # Error happened
tools.warning(err) tools.warning(err)
return False return (False, False)
cleanDOI = False if extractID is not None and extract_type == "DOI":
if extractDOI: # If DOI extracted, clean it and return it
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '') cleanDOI = False
cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
if clean_doi_re.search(cleanDOI): if clean_doi_re.search(cleanDOI):
cleanDOI = cleanDOI[1:] cleanDOI = cleanDOI[1:]
# FABSE J fix # FABSE J fix
@ -243,7 +262,11 @@ def findDOI(src):
if cleanDOItemp[i].isalpha() and digitStart: if cleanDOItemp[i].isalpha() and digitStart:
break break
cleanDOI = cleanDOI[0:(8+i)] cleanDOI = cleanDOI[0:(8+i)]
return cleanDOI return ("DOI", cleanDOI)
elif extractID is not None and extract_type == "arXiv":
# If arXiv id is extracted, return it
return ("arXiv", extractID.group(1))
return (False, False)
def doi2Bib(doi): def doi2Bib(doi):
@ -276,45 +299,6 @@ def doi2Bib(doi):
return '' return ''
# Matches "arXiv:<id>" (case-insensitive, optional whitespace after the
# colon); group 1 captures the identifier itself.
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)


def findArXivId(src):
    """Search for a valid arXiv id in src.

    src is the path of a ".pdf" or ".djvu" file; its text is extracted with
    the external "pdftotext" / "djvutxt" command and scanned with arXiv_re.

    Returns the first matching arXiv id (a string), or False if src has an
    unsupported extension, no id was found or the converter reported an
    error (in which case its stderr is passed to tools.warning).

    From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    extractfull = ''
    # Must be bound before the loop: if the converter has already exited
    # when poll() is first called, the loop body never runs.
    extractID = None
    while totext.poll() is None:
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in totext.stdout.readlines()])
        extractID = arXiv_re.search(extractfull)
        if extractID:
            # Id found, no need to let the converter run any further.
            totext.terminate()
            break

    remaining, err = totext.communicate()
    # Only positive codes are errors; a converter we terminated ourselves
    # reports a negative return code on POSIX.
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    if extractID is None and remaining:
        # Output that was still buffered when the process exited (e.g. the
        # loop above never ran) has to be searched as well.
        extractfull += ' '.join([i.decode(stdout_encoding).strip()
                                 for i in remaining.splitlines()])
        extractID = arXiv_re.search(extractfull)

    if extractID is not None:
        return extractID.group(1)
    return False
def arXiv2Bib(arxiv): def arXiv2Bib(arxiv):
"""Returns bibTeX string of metadata for a given arXiv id """Returns bibTeX string of metadata for a given arXiv id

View File

@ -50,16 +50,22 @@ class TestFetcher(unittest.TestCase):
self.assertEqual(isbn2Bib('foo'), '') self.assertEqual(isbn2Bib('foo'), '')
def test_findDOI_PDF(self): def test_findDOI_PDF(self):
self.assertEqual(findDOI("libbmc/tests/src/test.pdf"), self.assertEqual(findArticleID("libbmc/tests/src/test.pdf"),
"10.1103/physrevlett.112.253201") ("DOI", "10.1103/physrevlett.112.253201"))
def test_findDOI_DJVU(self): def test_findOnlyDOI(self):
self.assertEqual(findArticleID("libbmc/tests/src/test.pdf",
only=["DOI"]),
("DOI", "10.1103/physrevlett.112.253201"))
def test_findDOID_DJVU(self):
# DOI is incomplete in this test because my djvu file is bad # DOI is incomplete in this test because my djvu file is bad
self.assertEqual(findDOI("libbmc/tests/src/test.djvu"), self.assertEqual(findArticleID("libbmc/tests/src/test.djvu"),
"10.1103/physrevlett.112") ("DOI", "10.1103/physrevlett.112"))
def test_findDOI_False(self): def test_findDOI_False(self):
self.assertFalse(findDOI("libbmc/tests/src/test_arxiv_multi.pdf")) self.assertFalse(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf",
only=["DOI"])[0])
def test_doi2Bib(self): def test_doi2Bib(self):
self.assertEqual(doi2Bib('10.1103/physreva.88.043630'), self.doi_bib) self.assertEqual(doi2Bib('10.1103/physreva.88.043630'), self.doi_bib)
@ -68,8 +74,18 @@ class TestFetcher(unittest.TestCase):
self.assertEqual(doi2Bib('blabla'), '') self.assertEqual(doi2Bib('blabla'), '')
def test_findArXivId(self): def test_findArXivId(self):
self.assertEqual(findArXivId("libbmc/tests/src/test_arxiv_multi.pdf"), self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf"),
'1303.3130v1') ("arXiv", '1303.3130v1'))
def test_findOnlyArXivId(self):
    """Restricting the search to arXiv ids still finds the arXiv id."""
    pdf_path = "libbmc/tests/src/test_arxiv_multi.pdf"
    found = findArticleID(pdf_path, only=["arXiv"])
    self.assertEqual(found, ("arXiv", '1303.3130v1'))
def test_findArticleID(self):
    """The sample PDF for issue #19 must be identified by its arXiv id."""
    # cf https://github.com/Phyks/BMC/issues/19
    pdf_path = "libbmc/tests/src/test_arxiv_doi_conflict.pdf"
    found = findArticleID(pdf_path)
    self.assertEqual(found, ("arXiv", '1107.4487v1'))
def test_arXiv2Bib(self): def test_arXiv2Bib(self):
self.assertEqual(arXiv2Bib('1303.3130v1'), self.arxiv_bib) self.assertEqual(arXiv2Bib('1303.3130v1'), self.arxiv_bib)