References extraction using CERMINE
This commit is contained in:
parent
d4d0e97295
commit
8a905a9776
@ -11,6 +11,7 @@ import xml.etree.ElementTree as ET
|
|||||||
|
|
||||||
from requests.exceptions import RequestException
|
from requests.exceptions import RequestException
|
||||||
|
|
||||||
|
from libbmc import tools
|
||||||
from libbmc.citations import plaintext
|
from libbmc.citations import plaintext
|
||||||
|
|
||||||
|
|
||||||
@ -79,7 +80,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
|
|||||||
"java",
|
"java",
|
||||||
"-cp", local,
|
"-cp", local,
|
||||||
"pl.edu.icm.cermine.PdfNLMContentExtractor",
|
"pl.edu.icm.cermine.PdfNLMContentExtractor",
|
||||||
"-path", pdf_file])
|
"-path", pdf_file]).decode("utf-8")
|
||||||
except (RequestException,
|
except (RequestException,
|
||||||
subprocess.CalledProcessError,
|
subprocess.CalledProcessError,
|
||||||
FileNotFoundError):
|
FileNotFoundError):
|
||||||
@ -123,8 +124,21 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
|
|||||||
the default location (``libbmc/external/cermine.jar``).
|
the default location (``libbmc/external/cermine.jar``).
|
||||||
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
||||||
"""
|
"""
|
||||||
# TODO
|
# TODO:
|
||||||
pass
|
# * Do not convert to plain text, but use the extra metadata from
|
||||||
|
# CERMINE
|
||||||
|
# Call CERMINE on the PDF file
|
||||||
|
cermine_output = cermine(pdf_file, force_API, override_local)
|
||||||
|
# Parse the resulting XML
|
||||||
|
root = ET.fromstring(cermine_output)
|
||||||
|
plaintext_references = [
|
||||||
|
# Remove extra whitespaces
|
||||||
|
tools.clean_whitespaces(
|
||||||
|
# Convert XML element to string, discarding any leading "[n]"
|
||||||
|
ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
|
||||||
|
for e in root.iter("mixed-citation")]
|
||||||
|
# Call the plaintext methods to fetch DOIs
|
||||||
|
return plaintext.get_cited_DOIs(plaintext_references)
|
||||||
|
|
||||||
|
|
||||||
def grobid(pdf_file):
|
def grobid(pdf_file):
|
||||||
|
Loading…
Reference in New Issue
Block a user