Add functions to extract references from a PDF file

Add some functions to extract references from a PDF file. They are basically wrappers around Cermine, Grobid and pdf-extract. The Grobid wrapper is still to be done and more deeply embedded in the toolchain.
2016-01-19 17:45:58 +01:00 · 2016-01-19 17:45:58 +01:00 · e9d7f3ad78
commit e9d7f3ad78
parent 962b4adc23
1 changed files with 126 additions and 0 deletions
--- a/libbmc/citations/pdf.py
+++ b/libbmc/citations/pdf.py
@ -0,0 +1,126 @@
 """
 This files contains all the functions to extract DOIs of citations from
 PDF files.
 """
 import requests
 import subprocess
 import xml.etree.ElementTree as ET
 from requests.exceptions import RequestException
 from libbmc.citations import plaintext
 CERMINE_BASE_URL = "http://cermine.ceon.pl/"
 def cermine(pdf_file):
    """
    Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract procedure on \
            the given PDF file, to retrieve citations (and more) from the \
            provided PDF file.
    .. note::
        This uses the `CERMINE API <http://cermine.ceon.pl/about.html>`_, and \
                hence, uploads the PDF file (so uses network). Check out \
                the CERMINE API terms.
    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output from CERMINE API or ``None`` if an error occurred. \
            No post-processing is done.
    """
    try:
        with open(pdf_file, "rb") as fh:
            r = requests.post(
                CERMINE_BASE_URL + "extract.do",
                headers={"Content-Type": "application/binary"},
                files={"file": fh}
            )
        return r.text
    except (RequestException, FileNotFoundError):
        return None
 def grobid(pdf_file):
    """
    Run `Grobid <https://github.com/kermitt2/grobid>`_ on a given PDF file to \
            extract references.
    .. note::
        Before using this function, you have to download and build Grobid on \
                your system. See \
                `<https://grobid.readthedocs.org/en/latest/Install-Grobid/>`_ \
                for more infos on this. You need Java and \
                ``grobid-core-`<current version>`.one-jar.jar`` to be in your \
                ``$PATH``.
    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output from ``Grobid`` or ``None`` if an error occurred.
    """
    # TODO + update docstring
    # TODO: Use https://github.com/kermitt2/grobid-example
    subprocess.check_output(["java",
                             "-jar", "grobid-core-0.3.0.one-jar.jar",
                             "-Xmx1024m",  # Avoid OutOfMemoryException
                             "-gH", "/path/to/Grobid/grobid/grobid-home",
                             "-gP", "/path/to/Grobid/grobid-home/config/grobid.properties",
                             "-dIn", "/path/to/input/directory",
                             "-dOut", "/path/to/output/directory",
                             "-exe", "processReferences"])
 def pdfextract(pdf_file):
    """
    Run `pdfextract <https://github.com/CrossRef/pdfextract>`_ on a given PDF \
            file to extract references.
    .. note::
        Before using this function, you have to install pdfextract on \
                your system. See \
                `<https://github.com/CrossRef/pdfextract#quick-start>`_ \
                for more infos on this. You need the ``pdf-extract`` command \
                to be in your ``$PATH``. This can be done easily using \
                ``gem install pdf-extract``, provided that you have a correct \
                Ruby install on your system.
    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output from ``pdfextract`` or ``None`` if an error \
            occurred. No post-processing is done. See \
            ``libbmc.citations.pdf.pdfextract_dois`` for a similar function \
            with post-processing to return DOIs.
    """
    # Run pdf-extract
    try:
        references = subprocess.check_output(["pdf-extract",
                                              "extract", "--references",
                                              pdf_file])
        return references
    except subprocess.CalledProcessError:
        return None
 def pdfextract_dois(pdf_file):
    """
    Extract DOIs of references using \
            `pdfextract <https://github.com/CrossRef/pdfextract>`_.
    .. note::
        See ``libbmc.citations.pdf.pdfextract`` function as this one is just \
                a wrapper around it.
        See ``libbmc.citations.plaintext.get_cited_dois`` as well for the \
                returned value, as it is ultimately called by this function.
    :param pdf_file: Path to the PDF file to handle.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    # Call pdf-extract on the PDF file
    references = pdfextract(pdf_file)
    # Parse the resulting XML
    root = ET.fromstring(references)
    plaintext_references = [e.text for e in root.iter("reference")]
    # Call the plaintext methods to fetch DOIs
    return plaintext.get_cited_DOIs(plaintext_references)