""" This files contains all the functions to extract DOIs of citations from PDF files. """ import requests import subprocess import xml.etree.ElementTree as ET from requests.exceptions import RequestException from libbmc.citations import plaintext CERMINE_BASE_URL = "http://cermine.ceon.pl/" def cermine(pdf_file): """ Run `CERMINE `_ to extract procedure on \ the given PDF file, to retrieve citations (and more) from the \ provided PDF file. .. note:: This uses the `CERMINE API `_, and \ hence, uploads the PDF file (so uses network). Check out \ the CERMINE API terms. :param pdf_file: Path to the PDF file to handle. :returns: Raw output from CERMINE API or ``None`` if an error occurred. \ No post-processing is done. """ try: with open(pdf_file, "rb") as fh: r = requests.post( CERMINE_BASE_URL + "extract.do", headers={"Content-Type": "application/binary"}, files={"file": fh} ) return r.text except (RequestException, FileNotFoundError): return None def grobid(pdf_file): """ Run `Grobid `_ on a given PDF file to \ extract references. .. note:: Before using this function, you have to download and build Grobid on \ your system. See \ ``_ \ for more infos on this. You need Java and \ ``grobid-core-``.one-jar.jar`` to be in your \ ``$PATH``. :param pdf_file: Path to the PDF file to handle. :returns: Raw output from ``Grobid`` or ``None`` if an error occurred. """ # TODO + update docstring # TODO: Use https://github.com/kermitt2/grobid-example subprocess.check_output(["java", "-jar", "grobid-core-0.3.0.one-jar.jar", "-Xmx1024m", # Avoid OutOfMemoryException "-gH", "/path/to/Grobid/grobid/grobid-home", "-gP", "/path/to/Grobid/grobid-home/config/grobid.properties", "-dIn", "/path/to/input/directory", "-dOut", "/path/to/output/directory", "-exe", "processReferences"]) def pdfextract(pdf_file): """ Run `pdfextract `_ on a given PDF \ file to extract references. .. note:: Before using this function, you have to install pdfextract on \ your system. See \ ``_ \ for more infos on this. You need the ``pdf-extract`` command \ to be in your ``$PATH``. This can be done easily using \ ``gem install pdf-extract``, provided that you have a correct \ Ruby install on your system. .. note:: ``pdfextract`` is full a bugs and as the time of writing this, \ you had to manually ``gem install pdf-reader -v 1.2.0`` \ before installing ``pdfextract`` or you would get errors. See \ `this Github issue `_. :param pdf_file: Path to the PDF file to handle. :returns: Raw output from ``pdfextract`` or ``None`` if an error \ occurred. No post-processing is done. See \ ``libbmc.citations.pdf.pdfextract_dois`` for a similar function \ with post-processing to return DOIs. """ try: # Run pdf-extract references = subprocess.check_output(["pdf-extract", "extract", "--references", pdf_file]) return references except subprocess.CalledProcessError: return None def pdfextract_dois(pdf_file): """ Extract DOIs of references using \ `pdfextract `_. .. note:: See ``libbmc.citations.pdf.pdfextract`` function as this one is just \ a wrapper around it. See ``libbmc.citations.plaintext.get_cited_dois`` as well for the \ returned value, as it is ultimately called by this function. :param pdf_file: Path to the PDF file to handle. :returns: A dict of cleaned plaintext citations and their associated DOI. """ # Call pdf-extract on the PDF file references = pdfextract(pdf_file) # Parse the resulting XML root = ET.fromstring(references) plaintext_references = [e.text for e in root.iter("reference")] # Call the plaintext methods to fetch DOIs return plaintext.get_cited_DOIs(plaintext_references)