Add functions to extract references from a PDF file

Add some functions to extract references from a PDF file. They are
basically wrappers around Cermine, Grobid and pdf-extract. The Grobid
wrapper is still to be done and more deeply embedded in the toolchain.
This commit is contained in:
Lucas Verney 2016-01-19 17:45:58 +01:00
parent 962b4adc23
commit e9d7f3ad78

126
libbmc/citations/pdf.py Normal file
View File

@ -0,0 +1,126 @@
"""
This files contains all the functions to extract DOIs of citations from
PDF files.
"""
import requests
import subprocess
import xml.etree.ElementTree as ET
from requests.exceptions import RequestException
from libbmc.citations import plaintext
CERMINE_BASE_URL = "http://cermine.ceon.pl/"
def cermine(pdf_file):
"""
Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract procedure on \
the given PDF file, to retrieve citations (and more) from the \
provided PDF file.
.. note::
This uses the `CERMINE API <http://cermine.ceon.pl/about.html>`_, and \
hence, uploads the PDF file (so uses network). Check out \
the CERMINE API terms.
:param pdf_file: Path to the PDF file to handle.
:returns: Raw output from CERMINE API or ``None`` if an error occurred. \
No post-processing is done.
"""
try:
with open(pdf_file, "rb") as fh:
r = requests.post(
CERMINE_BASE_URL + "extract.do",
headers={"Content-Type": "application/binary"},
files={"file": fh}
)
return r.text
except (RequestException, FileNotFoundError):
return None
def grobid(pdf_file):
"""
Run `Grobid <https://github.com/kermitt2/grobid>`_ on a given PDF file to \
extract references.
.. note::
Before using this function, you have to download and build Grobid on \
your system. See \
`<https://grobid.readthedocs.org/en/latest/Install-Grobid/>`_ \
for more infos on this. You need Java and \
``grobid-core-`<current version>`.one-jar.jar`` to be in your \
``$PATH``.
:param pdf_file: Path to the PDF file to handle.
:returns: Raw output from ``Grobid`` or ``None`` if an error occurred.
"""
# TODO + update docstring
# TODO: Use https://github.com/kermitt2/grobid-example
subprocess.check_output(["java",
"-jar", "grobid-core-0.3.0.one-jar.jar",
"-Xmx1024m", # Avoid OutOfMemoryException
"-gH", "/path/to/Grobid/grobid/grobid-home",
"-gP", "/path/to/Grobid/grobid-home/config/grobid.properties",
"-dIn", "/path/to/input/directory",
"-dOut", "/path/to/output/directory",
"-exe", "processReferences"])
def pdfextract(pdf_file):
"""
Run `pdfextract <https://github.com/CrossRef/pdfextract>`_ on a given PDF \
file to extract references.
.. note::
Before using this function, you have to install pdfextract on \
your system. See \
`<https://github.com/CrossRef/pdfextract#quick-start>`_ \
for more infos on this. You need the ``pdf-extract`` command \
to be in your ``$PATH``. This can be done easily using \
``gem install pdf-extract``, provided that you have a correct \
Ruby install on your system.
:param pdf_file: Path to the PDF file to handle.
:returns: Raw output from ``pdfextract`` or ``None`` if an error \
occurred. No post-processing is done. See \
``libbmc.citations.pdf.pdfextract_dois`` for a similar function \
with post-processing to return DOIs.
"""
# Run pdf-extract
try:
references = subprocess.check_output(["pdf-extract",
"extract", "--references",
pdf_file])
return references
except subprocess.CalledProcessError:
return None
def pdfextract_dois(pdf_file):
"""
Extract DOIs of references using \
`pdfextract <https://github.com/CrossRef/pdfextract>`_.
.. note::
See ``libbmc.citations.pdf.pdfextract`` function as this one is just \
a wrapper around it.
See ``libbmc.citations.plaintext.get_cited_dois`` as well for the \
returned value, as it is ultimately called by this function.
:param pdf_file: Path to the PDF file to handle.
:returns: A dict of cleaned plaintext citations and their associated DOI.
"""
# Call pdf-extract on the PDF file
references = pdfextract(pdf_file)
# Parse the resulting XML
root = ET.fromstring(references)
plaintext_references = [e.text for e in root.iter("reference")]
# Call the plaintext methods to fetch DOIs
return plaintext.get_cited_DOIs(plaintext_references)