Updated test files

According to https://github.com/Phyks/BMC/issues/7.

Also updated the fetcher file to fix two bugs:
* Whitespace in ISBN
* If the PDF-to-text (or DjVu-to-text) conversion did not run long enough,
the end of the file was not processed.
This commit is contained in:
Phyks 2014-06-29 23:02:44 +02:00
parent bc15f86057
commit b231b578cc
5 changed files with 16 additions and 8 deletions

View File

@ -67,7 +67,7 @@ def download(url):
return False return False
isbn_re = re.compile(r'isbn ((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])', isbn_re = re.compile(r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
re.IGNORECASE) re.IGNORECASE)
@ -89,7 +89,7 @@ def findISBN(src):
return False return False
while totext.poll() is None: while totext.poll() is None:
extractfull = totext.stdout.readline() extractfull = ' '.join([i.strip() for i in totext.stdout.readlines()])
extractISBN = isbn_re.search(extractfull.lower().replace('Œ', extractISBN = isbn_re.search(extractfull.lower().replace('Œ',
'-')) '-'))
if extractISBN: if extractISBN:
@ -146,7 +146,7 @@ def findDOI(src):
extractfull = '' extractfull = ''
while totext.poll() is None: while totext.poll() is None:
extractfull += "".join([i.strip() for i in totext.stdout.readlines()]) extractfull += ' '.join([i.strip() for i in totext.stdout.readlines()])
extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-')) extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
if not extractDOI: if not extractDOI:
# PNAS fix # PNAS fix
@ -232,7 +232,7 @@ def findArXivId(src):
extractfull = '' extractfull = ''
while totext.poll() is None: while totext.poll() is None:
extractfull += totext.stdout.readline().strip() extractfull += ' '.join([i.strip() for i in totext.stdout.readlines()])
extractID = arXiv_re.search(extractfull) extractID = arXiv_re.search(extractfull)
if extractID: if extractID:
totext.terminate() totext.terminate()
@ -291,7 +291,7 @@ def findHALId(src):
return False return False
while totext.poll() is None: while totext.poll() is None:
extractfull = totext.stdout.readline() extractfull = ' '.join([i.strip() for i in totext.stdout.readlines()])
extractID = HAL_re.search(extractfull) extractID = HAL_re.search(extractfull)
if extractID: if extractID:
totext.terminate() totext.terminate()

7
tests/src/isbn.bib Normal file
View File

@ -0,0 +1,7 @@
@book{0198507194,
title = {Bose-Einstein Condensation},
author = {Lev. P. Pitaevskii and S. Stringari},
isbn = {0198507194},
year = {2004},
publisher = {Clarendon Press}
}

BIN
tests/src/test_book.djvu Normal file

Binary file not shown.

BIN
tests/src/test_book.pdf Normal file

Binary file not shown.

View File

@ -34,10 +34,11 @@ class TestFetcher(unittest.TestCase):
self.assertFalse(download('a')) self.assertFalse(download('a'))
def test_findISBN_DJVU(self): def test_findISBN_DJVU(self):
self.assertEqual(findISBN("tests/src/test_book.djvu"), '0198507194') # ISBN is incomplete in this test because my djvu file is bad
self.assertEqual(findISBN("tests/src/test_book.djvu"), '978295391873')
def test_findISBN_PDF(self): def test_findISBN_PDF(self):
self.assertEqual(findISBN("tests/src/test_book.pdf"), '9780521846516') self.assertEqual(findISBN("tests/src/test_book.pdf"), '9782953918731')
def test_findISBN_False(self): def test_findISBN_False(self):
self.assertFalse(findISBN("tests/src/test.pdf")) self.assertFalse(findISBN("tests/src/test.pdf"))
@ -53,7 +54,7 @@ class TestFetcher(unittest.TestCase):
"10.1103/physrevlett.112.253201") "10.1103/physrevlett.112.253201")
def test_findDOI_DJVU(self): def test_findDOI_DJVU(self):
# DOI is incomplete in this text because my djvu file is bad # DOI is incomplete in this test because my djvu file is bad
self.assertEqual(findDOI("tests/src/test.djvu"), self.assertEqual(findDOI("tests/src/test.djvu"),
"10.1103/physrevlett.112") "10.1103/physrevlett.112")