Solve issue #19
This commit is contained in:
parent
9f821a409c
commit
a39c1d94d0
12
bmc.py
12
bmc.py
@ -102,9 +102,11 @@ def addFile(src, filetype, manual, autoconfirm, tag):
|
|||||||
if not manual:
|
if not manual:
|
||||||
try:
|
try:
|
||||||
if filetype == 'article' or filetype is None:
|
if filetype == 'article' or filetype is None:
|
||||||
doi = fetcher.findDOI(src)
|
id_type, article_id = fetcher.findID(src)
|
||||||
if doi is False and (filetype == 'article' or filetype is None):
|
if id_type == "DOI":
|
||||||
arxiv = fetcher.findArXivId(src)
|
doi = article_id
|
||||||
|
elif id_type == "arXiv":
|
||||||
|
arxiv = article_id
|
||||||
|
|
||||||
if filetype == 'book' or (doi is False and arxiv is False and
|
if filetype == 'book' or (doi is False and arxiv is False and
|
||||||
filetype is None):
|
filetype is None):
|
||||||
@ -338,7 +340,7 @@ def resync():
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
if 'doi' in list(entry.keys()):
|
if 'doi' in list(entry.keys()):
|
||||||
doi = fetcher.findDOI(filename)
|
doi = fetcher.findID(filename, only=["DOI"])
|
||||||
if doi is not False and doi != entry['doi']:
|
if doi is not False and doi != entry['doi']:
|
||||||
loop = tools.rawInput("Found DOI does not " +
|
loop = tools.rawInput("Found DOI does not " +
|
||||||
"match bibtex entry " +
|
"match bibtex entry " +
|
||||||
@ -346,7 +348,7 @@ def resync():
|
|||||||
"? [y/N]")
|
"? [y/N]")
|
||||||
loop = (loop.lower() != 'y')
|
loop = (loop.lower() != 'y')
|
||||||
if 'Eprint' in list(entry.keys()):
|
if 'Eprint' in list(entry.keys()):
|
||||||
arxiv = fetcher.findArXivId(filename)
|
arxiv = fetcher.findID(filename, only=["arXiv"])
|
||||||
if arxiv is not False and arxiv != entry['Eprint']:
|
if arxiv is not False and arxiv != entry['Eprint']:
|
||||||
loop = tools.rawInput("Found arXiv id does " +
|
loop = tools.rawInput("Found arXiv id does " +
|
||||||
"not match bibtex " +
|
"not match bibtex " +
|
||||||
|
@ -180,13 +180,16 @@ clean_doi_re = re.compile('^/')
|
|||||||
clean_doi_fabse_re = re.compile('^10.1096')
|
clean_doi_fabse_re = re.compile('^10.1096')
|
||||||
clean_doi_jcb_re = re.compile('^10.1083')
|
clean_doi_jcb_re = re.compile('^10.1083')
|
||||||
clean_doi_len_re = re.compile(r'\d\.\d')
|
clean_doi_len_re = re.compile(r'\d\.\d')
|
||||||
|
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def findDOI(src):
|
def findArticleID(src, only=["DOI", "arXiv"]):
|
||||||
"""Search for a valid DOI in src.
|
"""Search for a valid article ID (DOI or ArXiv) in src.
|
||||||
|
|
||||||
Returns the DOI or False if not found or an error occurred.
|
Returns a tuple (type, first matching ID), or (False, False) if not found
|
||||||
|
or an error occurred.
|
||||||
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
|
From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
|
||||||
|
and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
|
||||||
"""
|
"""
|
||||||
if src.endswith(".pdf"):
|
if src.endswith(".pdf"):
|
||||||
totext = subprocess.Popen(["pdftotext", src, "-"],
|
totext = subprocess.Popen(["pdftotext", src, "-"],
|
||||||
@ -197,33 +200,49 @@ def findDOI(src):
|
|||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE)
|
stderr=subprocess.PIPE)
|
||||||
else:
|
else:
|
||||||
return False
|
return (False, False)
|
||||||
|
|
||||||
extractfull = ''
|
extractfull = ''
|
||||||
|
extract_type = False
|
||||||
|
extractID = None
|
||||||
while totext.poll() is None:
|
while totext.poll() is None:
|
||||||
extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])
|
extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])
|
||||||
extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
|
# Try to extract DOI
|
||||||
if not extractDOI:
|
if "DOI" in only:
|
||||||
# PNAS fix
|
extractID = doi_re.search(extractfull.lower().replace('Œ', '-'))
|
||||||
extractDOI = doi_pnas_re.search(extractfull.
|
if not extractID:
|
||||||
lower().
|
# PNAS fix
|
||||||
replace('pnas', '/pnas'))
|
extractID = doi_pnas_re.search(extractfull.
|
||||||
if not extractDOI:
|
lower().
|
||||||
# JSB fix
|
replace('pnas', '/pnas'))
|
||||||
extractDOI = doi_jsb_re.search(extractfull.lower())
|
if not extractID:
|
||||||
if extractDOI:
|
# JSB fix
|
||||||
totext.terminate()
|
extractID = doi_jsb_re.search(extractfull.lower())
|
||||||
|
if extractID:
|
||||||
|
extract_type = "DOI"
|
||||||
|
totext.terminate()
|
||||||
|
# Try to extract arXiv
|
||||||
|
if "arXiv" in only:
|
||||||
|
tmp_extractID = arXiv_re.search(extractfull)
|
||||||
|
if tmp_extractID:
|
||||||
|
if not extractID or extractID.start(0) > tmp_extractID.start(1):
|
||||||
|
# Only use arXiv id if it is before the DOI in the pdf
|
||||||
|
extractID = tmp_extractID
|
||||||
|
extract_type = "arXiv"
|
||||||
|
totext.terminate()
|
||||||
|
if extract_type is not False:
|
||||||
break
|
break
|
||||||
|
|
||||||
err = totext.communicate()[1]
|
err = totext.communicate()[1]
|
||||||
if totext.returncode > 0:
|
if totext.returncode > 0:
|
||||||
# Error happened
|
# Error happened
|
||||||
tools.warning(err)
|
tools.warning(err)
|
||||||
return False
|
return (False, False)
|
||||||
|
|
||||||
cleanDOI = False
|
if extractID is not None and extract_type == "DOI":
|
||||||
if extractDOI:
|
# If DOI extracted, clean it and return it
|
||||||
cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
|
cleanDOI = False
|
||||||
|
cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
|
||||||
if clean_doi_re.search(cleanDOI):
|
if clean_doi_re.search(cleanDOI):
|
||||||
cleanDOI = cleanDOI[1:]
|
cleanDOI = cleanDOI[1:]
|
||||||
# FASEB J fix
|
# FASEB J fix
|
||||||
@ -243,7 +262,11 @@ def findDOI(src):
|
|||||||
if cleanDOItemp[i].isalpha() and digitStart:
|
if cleanDOItemp[i].isalpha() and digitStart:
|
||||||
break
|
break
|
||||||
cleanDOI = cleanDOI[0:(8+i)]
|
cleanDOI = cleanDOI[0:(8+i)]
|
||||||
return cleanDOI
|
return ("DOI", cleanDOI)
|
||||||
|
elif extractID is not None and extract_type == "arXiv":
|
||||||
|
# If arXiv id is extracted, return it
|
||||||
|
return ("arXiv", extractID.group(1))
|
||||||
|
return (False, False)
|
||||||
|
|
||||||
|
|
||||||
def doi2Bib(doi):
|
def doi2Bib(doi):
|
||||||
@ -276,45 +299,6 @@ def doi2Bib(doi):
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
def findArXivId(src):
|
|
||||||
"""Searches for a valid arXiv id in src.
|
|
||||||
|
|
||||||
Returns the arXiv id or False if not found or an error occurred.
|
|
||||||
From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
|
|
||||||
"""
|
|
||||||
if src.endswith(".pdf"):
|
|
||||||
totext = subprocess.Popen(["pdftotext", src, "-"],
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE)
|
|
||||||
elif src.endswith(".djvu"):
|
|
||||||
totext = subprocess.Popen(["djvutxt", src],
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE)
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
extractfull = ''
|
|
||||||
while totext.poll() is None:
|
|
||||||
extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])
|
|
||||||
extractID = arXiv_re.search(extractfull)
|
|
||||||
if extractID:
|
|
||||||
totext.terminate()
|
|
||||||
break
|
|
||||||
|
|
||||||
err = totext.communicate()[1]
|
|
||||||
if totext.returncode > 0:
|
|
||||||
# Error happened
|
|
||||||
tools.warning(err)
|
|
||||||
return False
|
|
||||||
elif extractID is not None:
|
|
||||||
return extractID.group(1)
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def arXiv2Bib(arxiv):
|
def arXiv2Bib(arxiv):
|
||||||
"""Returns bibTeX string of metadata for a given arXiv id
|
"""Returns bibTeX string of metadata for a given arXiv id
|
||||||
|
|
||||||
|
@ -50,16 +50,22 @@ class TestFetcher(unittest.TestCase):
|
|||||||
self.assertEqual(isbn2Bib('foo'), '')
|
self.assertEqual(isbn2Bib('foo'), '')
|
||||||
|
|
||||||
def test_findDOI_PDF(self):
|
def test_findDOI_PDF(self):
|
||||||
self.assertEqual(findDOI("libbmc/tests/src/test.pdf"),
|
self.assertEqual(findArticleID("libbmc/tests/src/test.pdf"),
|
||||||
"10.1103/physrevlett.112.253201")
|
("DOI", "10.1103/physrevlett.112.253201"))
|
||||||
|
|
||||||
def test_findDOI_DJVU(self):
|
def test_findOnlyDOI(self):
|
||||||
|
self.assertEqual(findArticleID("libbmc/tests/src/test.pdf",
|
||||||
|
only=["DOI"]),
|
||||||
|
("DOI", "10.1103/physrevlett.112.253201"))
|
||||||
|
|
||||||
|
def test_findDOID_DJVU(self):
|
||||||
# DOI is incomplete in this test because my djvu file is bad
|
# DOI is incomplete in this test because my djvu file is bad
|
||||||
self.assertEqual(findDOI("libbmc/tests/src/test.djvu"),
|
self.assertEqual(findArticleID("libbmc/tests/src/test.djvu"),
|
||||||
"10.1103/physrevlett.112")
|
("DOI", "10.1103/physrevlett.112"))
|
||||||
|
|
||||||
def test_findDOI_False(self):
|
def test_findDOI_False(self):
|
||||||
self.assertFalse(findDOI("libbmc/tests/src/test_arxiv_multi.pdf"))
|
self.assertFalse(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf",
|
||||||
|
only=["DOI"])[0])
|
||||||
|
|
||||||
def test_doi2Bib(self):
|
def test_doi2Bib(self):
|
||||||
self.assertEqual(doi2Bib('10.1103/physreva.88.043630'), self.doi_bib)
|
self.assertEqual(doi2Bib('10.1103/physreva.88.043630'), self.doi_bib)
|
||||||
@ -68,8 +74,18 @@ class TestFetcher(unittest.TestCase):
|
|||||||
self.assertEqual(doi2Bib('blabla'), '')
|
self.assertEqual(doi2Bib('blabla'), '')
|
||||||
|
|
||||||
def test_findArXivId(self):
|
def test_findArXivId(self):
|
||||||
self.assertEqual(findArXivId("libbmc/tests/src/test_arxiv_multi.pdf"),
|
self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf"),
|
||||||
'1303.3130v1')
|
("arXiv", '1303.3130v1'))
|
||||||
|
|
||||||
|
def test_findOnlyArXivId(self):
|
||||||
|
self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_multi.pdf",
|
||||||
|
only=["arXiv"]),
|
||||||
|
("arXiv", '1303.3130v1'))
|
||||||
|
|
||||||
|
def test_findArticleID(self):
|
||||||
|
# cf https://github.com/Phyks/BMC/issues/19
|
||||||
|
self.assertEqual(findArticleID("libbmc/tests/src/test_arxiv_doi_conflict.pdf"),
|
||||||
|
("arXiv", '1107.4487v1'))
|
||||||
|
|
||||||
def test_arXiv2Bib(self):
|
def test_arXiv2Bib(self):
|
||||||
self.assertEqual(arXiv2Bib('1303.3130v1'), self.arxiv_bib)
|
self.assertEqual(arXiv2Bib('1303.3130v1'), self.arxiv_bib)
|
||||||
|
Loading…
Reference in New Issue
Block a user