"""
This file contains all the functions to extract DOIs of citations from
PDF files.
"""
import requests
import subprocess
import xml.etree.ElementTree as ET

from requests.exceptions import RequestException

from libbmc.citations import plaintext


CERMINE_BASE_URL = "http://cermine.ceon.pl/"


def cermine(pdf_file):
    """
    Run `CERMINE <https://github.com/CeON/CERMINE>`_ extraction on the
    given PDF file, to retrieve citations (and more) from it.

    .. note::

        This uses the CERMINE web API (see ``CERMINE_BASE_URL``), and
        hence, uploads the PDF file (so uses network). Check out the
        CERMINE API terms.

    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output from CERMINE API or ``None`` if an error occurred
        (unreadable file, network failure or HTTP error status).
        No post-processing is done.
    """
    try:
        with open(pdf_file, "rb") as fh:
            # The PDF is sent as the raw request body (``data=``), which is
            # what the "application/binary" content type announces. Using
            # ``files=`` would multipart-encode the body while still
            # labelling it as plain binary.
            response = requests.post(
                CERMINE_BASE_URL + "extract.do",
                headers={"Content-Type": "application/binary"},
                data=fh
            )
        # Do not hand back an HTTP error page as if it were CERMINE output;
        # HTTPError is a RequestException and is handled below.
        response.raise_for_status()
        return response.text
    except (RequestException, OSError):
        # OSError covers a missing, unreadable or otherwise unopenable file.
        return None


def grobid(pdf_file):
    """
    Run `Grobid <https://github.com/kermitt2/grobid>`_ to extract
    references.

    .. warning::

        This function is still a stub: ``pdf_file`` is currently ignored
        and the Grobid home, properties, input and output paths in the
        command are placeholders that have to be replaced before it can do
        anything useful.

    .. note::

        Before using this function, you have to download and build Grobid
        on your system. See the Grobid installation documentation for more
        infos on this. You need Java and
        ``grobid-core-<version>.one-jar.jar`` to be reachable.

    :param pdf_file: Path to the PDF file to handle (not used yet).
    :returns: Captured standard output of the ``Grobid`` process or
        ``None`` if an error occurred.
    """
    # TODO: Use ``pdf_file`` and real paths + update docstring
    # TODO: Use https://github.com/kermitt2/grobid-example
    command = [
        "java",
        # Avoid OutOfMemoryException. JVM options must come before
        # ``-jar``; anything after the jar name is passed to the
        # application instead of the JVM.
        "-Xmx1024m",
        "-jar", "grobid-core-0.3.0.one-jar.jar",
        "-gH", "/path/to/Grobid/grobid/grobid-home",
        "-gP", "/path/to/Grobid/grobid-home/config/grobid.properties",
        "-dIn", "/path/to/input/directory",
        "-dOut", "/path/to/output/directory",
        "-exe", "processReferences",
    ]
    try:
        return subprocess.check_output(command)
    except (subprocess.CalledProcessError, OSError):
        # Non-zero exit status, or ``java`` could not be started at all.
        return None


def pdfextract(pdf_file):
    """
    Run `pdfextract <https://github.com/CrossRef/pdfextract>`_ on a given
    PDF file to extract references.

    .. note::

        Before using this function, you have to install pdfextract on
        your system. You need the ``pdf-extract`` command to be in your
        ``$PATH``. This can be done easily using
        ``gem install pdf-extract``, provided that you have a correct
        Ruby install on your system.

    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output from ``pdfextract`` or ``None`` if an error
        occurred (non-zero exit status or ``pdf-extract`` could not be
        started). No post-processing is done. See
        ``libbmc.citations.pdf.pdfextract_dois`` for a similar function
        with post-processing to return DOIs.
    """
    # Run pdf-extract
    try:
        return subprocess.check_output(["pdf-extract",
                                        "extract", "--references",
                                        pdf_file])
    except (subprocess.CalledProcessError, OSError):
        # OSError is raised when the ``pdf-extract`` executable is missing.
        return None


def pdfextract_dois(pdf_file):
    """
    Extract DOIs of references using
    `pdfextract <https://github.com/CrossRef/pdfextract>`_.

    .. note::

        See ``libbmc.citations.pdf.pdfextract`` function as this one is
        just a wrapper around it.
        See ``libbmc.citations.plaintext.get_cited_DOIs`` as well for the
        returned value, as it is ultimately called by this function.

    :param pdf_file: Path to the PDF file to handle.
    :returns: A dict of cleaned plaintext citations and their associated
        DOI, or ``None`` if ``pdfextract`` failed or its output could not
        be parsed as XML.
    """
    # Call pdf-extract on the PDF file
    references = pdfextract(pdf_file)
    if references is None:
        # pdfextract reports failures as None; it cannot be parsed.
        return None
    # Parse the resulting XML
    try:
        root = ET.fromstring(references)
    except ET.ParseError:
        return None
    plaintext_references = [e.text for e in root.iter("reference")]
    # Call the plaintext methods to fetch DOIs
    # NOTE(review): the plaintext module is not visible here; confirm the
    # exact spelling of ``get_cited_DOIs`` against it.
    return plaintext.get_cited_DOIs(plaintext_references)