"""
This file contains all the functions to extract DOIs of citations from PDF
files.

# TODO: Unittests
"""
import subprocess
import xml.etree.ElementTree as ET

import requests
from requests.exceptions import RequestException

from libbmc.citations import plaintext


CERMINE_BASE_URL = "http://cermine.ceon.pl/"


def cermine(pdf_file):
    """
    Run the CERMINE (http://cermine.ceon.pl/) extraction procedure on the
    given PDF file, to retrieve citations (and more) from it.

    .. note::

        This uses the CERMINE web API and hence uploads the PDF file (so
        it uses the network). Check out the CERMINE API terms.

    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output (text of the HTTP response) from the CERMINE API,
        or ``None`` if an error occurred (unreadable file, network failure
        or HTTP error status). No post-processing is done.
    """
    try:
        with open(pdf_file, "rb") as fh:
            # The request announces a raw binary payload, so the PDF bytes
            # are sent as the body itself (``data=``). Using ``files=``
            # would build a multipart body that contradicts the explicit
            # ``Content-Type`` header.
            response = requests.post(
                CERMINE_BASE_URL + "extract.do",
                headers={"Content-Type": "application/binary"},
                data=fh,
            )
        # Turn 4xx / 5xx answers into an exception so that an error page is
        # never returned as if it were an extraction result.
        response.raise_for_status()
        return response.text
    except (RequestException, OSError):
        # OSError covers a missing file as well as permission problems or a
        # path that is not a regular file.
        return None


def grobid(pdf_file):
    """
    Run Grobid to extract references.

    .. note::

        Before using this function, you have to download and build Grobid
        on your system. You need Java and
        ``grobid-core-<version>.one-jar.jar`` to be in your ``$PATH``.

    .. warning::

        This function is still a work in progress: the Grobid home,
        properties file, input and output directories are hard-coded
        placeholders, and ``pdf_file`` is not used yet.

    :param pdf_file: Path to the PDF file to handle (currently unused).
    :returns: Raw output from ``Grobid`` or ``None`` if an error occurred.
    """
    # TODO + update docstring
    # TODO: Use https://github.com/kermitt2/grobid-example
    command = [
        "java",
        # JVM options have to come before ``-jar``; anything placed after
        # the jar file is handed to the program instead of the JVM.
        "-Xmx1024m",  # Avoid OutOfMemoryException
        "-jar", "grobid-core-0.3.0.one-jar.jar",
        "-gH", "/path/to/Grobid/grobid/grobid-home",
        "-gP", "/path/to/Grobid/grobid-home/config/grobid.properties",
        "-dIn", "/path/to/input/directory",
        "-dOut", "/path/to/output/directory",
        "-exe", "processReferences",
    ]
    try:
        return subprocess.check_output(command)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: Grobid exited with a non-zero status.
        # OSError: the ``java`` executable could not be started.
        return None


def pdfextract(pdf_file):
    """
    Run pdfextract on a given PDF file to extract references.

    .. note::

        Before using this function, you have to install pdfextract on
        your system. You need the ``pdf-extract`` command to be in your
        ``$PATH``. This can be done easily using
        ``gem install pdf-extract``, provided that you have a correct
        Ruby install on your system.

    .. note::

        ``pdfextract`` is full of bugs and at the time of writing this,
        you had to manually ``gem install pdf-reader -v 1.2.0`` before
        installing ``pdfextract`` or you would get errors. See the
        related Github issue of the pdfextract project.

    :param pdf_file: Path to the PDF file to handle.
    :returns: Raw output (bytes) from ``pdfextract`` or ``None`` if an
        error occurred. No post-processing is done. See
        ``libbmc.citations.pdf.pdfextract_dois`` for a similar function
        with post-processing to return DOIs.
    """
    try:
        # Run pdf-extract
        return subprocess.check_output(
            ["pdf-extract", "extract", "--references", pdf_file])
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: pdf-extract exited with a non-zero status.
        # OSError: the ``pdf-extract`` executable could not be started.
        return None


def pdfextract_dois(pdf_file):
    """
    Extract DOIs of references using pdfextract.

    .. note::

        See ``libbmc.citations.pdf.pdfextract`` function as this one is
        just a wrapper around it.

        See ``libbmc.citations.plaintext.get_cited_DOIs`` as well for the
        returned value, as it is ultimately called by this function.

    :param pdf_file: Path to the PDF file to handle.
    :returns: A dict of cleaned plaintext citations and their associated
        DOI, or ``None`` if ``pdfextract`` failed on the given file.
    """
    # Call pdf-extract on the PDF file
    references = pdfextract(pdf_file)
    if references is None:
        # pdfextract already reported a failure, there is nothing to parse.
        return None

    # Parse the resulting XML
    root = ET.fromstring(references)
    # Empty ``<reference/>`` elements have no text (``None``); they carry
    # no citation and are skipped.
    plaintext_references = [
        element.text for element in root.iter("reference") if element.text
    ]

    # Call the plaintext methods to fetch DOIs
    # NOTE(review): the previous docstring spelled this helper
    # ``get_cited_dois``; confirm which name the plaintext module exposes.
    return plaintext.get_cited_DOIs(plaintext_references)