References extraction using CERMINE

This commit is contained in:
Lucas Verney 2016-01-24 22:23:41 +01:00
parent d4d0e97295
commit 8a905a9776

View File

@@ -11,6 +11,7 @@ import xml.etree.ElementTree as ET
from requests.exceptions import RequestException from requests.exceptions import RequestException
from libbmc import tools
from libbmc.citations import plaintext from libbmc.citations import plaintext
@@ -79,7 +80,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
"java", "java",
"-cp", local, "-cp", local,
"pl.edu.icm.cermine.PdfNLMContentExtractor", "pl.edu.icm.cermine.PdfNLMContentExtractor",
"-path", pdf_file]) "-path", pdf_file]).decode("utf-8")
except (RequestException, except (RequestException,
subprocess.CalledProcessError, subprocess.CalledProcessError,
FileNotFoundError): FileNotFoundError):
@@ -123,8 +124,21 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
the default location (``libbmc/external/cermine.jar``). the default location (``libbmc/external/cermine.jar``).
:returns: A dict of cleaned plaintext citations and their associated DOI. :returns: A dict of cleaned plaintext citations and their associated DOI.
""" """
# TODO # TODO:
pass # * Do not convert to plain text, but use the extra metadata from
# CERMINE
# Call CERMINE on the PDF file
cermine_output = cermine(pdf_file, force_API, override_local)
# Parse the resulting XML
root = ET.fromstring(cermine_output)
plaintext_references = [
# Remove extra whitespaces
tools.clean_whitespaces(
# Convert XML element to string, discarding any leading "[n]"
ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
for e in root.iter("mixed-citation")]
# Call the plaintext methods to fetch DOIs
return plaintext.get_cited_DOIs(plaintext_references)
def grobid(pdf_file): def grobid(pdf_file):