Wrapper around Grobid

This commit is contained in:
Lucas Verney 2016-01-30 16:28:21 +01:00
parent f5183a1d11
commit a2875fc242
1 changed file with 24 additions and 17 deletions

View File

@ -141,9 +141,9 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
return plaintext.get_cited_DOIs(plaintext_references) return plaintext.get_cited_DOIs(plaintext_references)
def grobid(pdf_file): def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
""" """
Run `Grobid <https://github.com/kermitt2/grobid>`_ on a given PDF file to \ Run `Grobid <https://github.com/kermitt2/grobid>`_ on a given folder to \
extract references. extract references.
.. note:: .. note::
@ -151,23 +151,30 @@ def grobid(pdf_file):
Before using this function, you have to download and build Grobid on \ Before using this function, you have to download and build Grobid on \
your system. See \ your system. See \
`<https://grobid.readthedocs.org/en/latest/Install-Grobid/>`_ \ `<https://grobid.readthedocs.org/en/latest/Install-Grobid/>`_ \
for more infos on this. You need Java and \ for more infos on this. You need Java to be in your ``$PATH``.
``grobid-core-`<current version>`.one-jar.jar`` to be in your \
``$PATH``.
:param pdf_file: Path to the PDF file to handle. :param pdf_folder: Folder containing the PDF files to handle.
:returns: Raw output from ``Grobid`` or ``None`` if an error occurred. :param grobid_home: Path to the grobid-home directory.
:param grobid_jar: Path to the built Grobid JAR file.
:returns: ``True``, or ``False`` if an error occurred.
""" """
# TODO + update docstring if grobid_home is None or grobid_jar is None:
# TODO: Use https://github.com/kermitt2/grobid-example # User should pass the correct paths
subprocess.check_output(["java", return False
"-jar", "grobid-core-0.3.0.one-jar.jar",
"-Xmx1024m", # Avoid OutOfMemoryException try:
"-gH", "/path/to/Grobid/grobid/grobid-home", subprocess.call(["java",
"-gP", "/path/to/Grobid/grobid-home/config/grobid.properties", "-jar", grobid_jar,
"-dIn", "/path/to/input/directory", # Avoid OutOfMemoryException
"-dOut", "/path/to/output/directory", "-Xmx1024m",
"-exe", "processReferences"]) "-gH", grobid_home,
"-gP", os.path.join(grobid_home,
"config/grobid.properties"),
"-dIn", pdf_folder,
"-exe", "processReferences"])
return True
except subprocess.CalledProcessError:
return False
def pdfextract(pdf_file): def pdfextract(pdf_file):