2015-12-27 23:46:43 +01:00
|
|
|
"""
|
2015-12-28 00:21:41 +01:00
|
|
|
This files contains all the functions to extract DOIs of citations from .bbl
|
|
|
|
files.
|
2015-12-27 23:46:43 +01:00
|
|
|
"""
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
from libbmc import tools
|
2015-12-28 00:42:27 +01:00
|
|
|
from libbmc.citations import plaintext
|
2015-12-27 23:46:43 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Regex to match bibitems
|
|
|
|
BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
|
|
|
|
# Regex to match end of bibliography
|
|
|
|
ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
|
|
|
|
|
|
|
|
|
|
|
|
def bibitem_as_plaintext(bibitem):
|
|
|
|
"""
|
2015-12-28 00:42:27 +01:00
|
|
|
Return a plaintext representation of a bibitem from the ``.bbl`` file.
|
2015-12-27 23:46:43 +01:00
|
|
|
|
|
|
|
.. note::
|
|
|
|
|
|
|
|
This plaintext representation can be super ugly, contain URLs and so \
|
|
|
|
on.
|
|
|
|
|
|
|
|
:param bibitem: The text content of the bibitem.
|
|
|
|
:returns: A cleaned plaintext citation from the bibitem.
|
|
|
|
"""
|
|
|
|
try:
|
|
|
|
output = subprocess.check_output(["delatex",
|
|
|
|
"-s"],
|
|
|
|
input=bibitem.encode("utf-8"))
|
|
|
|
except FileNotFoundError:
|
|
|
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
output = subprocess.check_output(["%s/../external/opendetex/delatex" %
|
|
|
|
(script_dir,),
|
|
|
|
"-s"],
|
|
|
|
input=bibitem.encode("utf-8"))
|
|
|
|
output = output.decode("utf-8")
|
|
|
|
output = tools.clean_whitespaces(output)
|
|
|
|
return output
|
|
|
|
|
|
|
|
|
|
|
|
def get_plaintext_citations(bbl):
|
|
|
|
"""
|
|
|
|
Parse a ``*.bbl`` file to get a clean list of plaintext citations.
|
|
|
|
|
|
|
|
:param bbl: Either the path to the .bbl file or the content of a ``.bbl`` \
|
|
|
|
file.
|
|
|
|
:returns: A list of cleaned plaintext citations.
|
|
|
|
"""
|
|
|
|
# Handle path or content
|
|
|
|
if os.path.isfile(bbl):
|
|
|
|
with open(bbl, 'r') as fh:
|
|
|
|
bbl_content = fh.read()
|
|
|
|
else:
|
|
|
|
bbl_content = bbl
|
|
|
|
# Get a list of bibitems, taking the first item out as it is *before* the
|
|
|
|
# first \bibitem
|
|
|
|
bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
|
|
|
|
# Delete the text after the \end{thebibliography}
|
|
|
|
bibitems = [ENDTHEBIBLIOGRAPHY_REGEX.sub("", i).strip() for i in bibitems]
|
|
|
|
# Clean every bibitem to have plaintext
|
|
|
|
cleaned_bbl = [bibitem_as_plaintext(bibitem) for bibitem in bibitems]
|
|
|
|
return cleaned_bbl
|
|
|
|
|
|
|
|
|
2015-12-28 00:42:27 +01:00
|
|
|
def get_cited_DOIs(bbl):
|
2015-12-27 23:46:43 +01:00
|
|
|
"""
|
2015-12-28 00:42:27 +01:00
|
|
|
Get the DOIs of the papers cited in a .bbl file.
|
2015-12-27 23:46:43 +01:00
|
|
|
|
2015-12-28 00:42:27 +01:00
|
|
|
:param bbl: Either the path to a .bbl file or the content \
|
2015-12-28 00:21:41 +01:00
|
|
|
of a .bbl file.
|
2015-12-27 23:46:43 +01:00
|
|
|
|
|
|
|
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
|
|
|
"""
|
|
|
|
# Get the plaintext citations from the bbl file
|
2015-12-28 00:42:27 +01:00
|
|
|
plaintext_citations = get_plaintext_citations(bbl)
|
2015-12-28 00:21:41 +01:00
|
|
|
# Use the plaintext citations parser on these citations
|
2015-12-28 00:42:27 +01:00
|
|
|
return plaintext.get_cited_DOIs(plaintext_citations)
|