libbmc/libbmc/citations/bibtex.py

75 lines
2.4 KiB
Python

"""
This file contains all the functions to extract DOIs of citations from
BibTeX files.
"""
import bibtexparser
import os
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode
from libbmc import tools
from libbmc.citations import plaintext
def bibentry_as_plaintext(bibentry):
    """
    Flatten a single BibTeX entry into one plaintext string.

    Every field value of the entry is concatenated, separated by single
    spaces, and the result is passed through ``tools.clean_whitespaces``.

    .. note::

        No field is filtered out, so the resulting text may be quite ugly
        and may contain URLs, entry keys and so on.

    :param bibentry: A bibentry as parsed by ``bibtexparser`` (a mapping \
            of field names to string values).
    :returns: The whitespace-cleaned plaintext citation for this bibentry.
    """
    # Collect the value of every field, in the mapping's iteration order.
    field_values = [bibentry[field] for field in bibentry]
    flattened = " ".join(field_values)
    return tools.clean_whitespaces(flattened)
def get_plaintext_citations(bibtex):
    """
    Build the list of plaintext citations found in some BibTeX data.

    :param bibtex: Either the path to an existing BibTeX file, or directly \
            the content of a BibTeX file as a string.
    :returns: A list with one flattened plaintext citation per entry.
    """
    # Configure the parser so that entries are converted to unicode.
    bib_parser = BibTexParser()
    bib_parser.customization = convert_to_unicode

    if not os.path.isfile(bibtex):
        # Not the path of an existing file: treat the argument as raw
        # BibTeX content.
        database = bibtexparser.loads(bibtex, parser=bib_parser)
    else:
        with open(bibtex) as bib_file:
            database = bibtexparser.load(bib_file, parser=bib_parser)

    # Flatten each parsed entry into its plaintext representation.
    return [bibentry_as_plaintext(entry) for entry in database.entries]
def get_cited_DOIs(bibtex):
    """
    Look up the DOIs of the papers cited in a BibTeX file.

    .. note::

        The BibTeX entries are first flattened into plaintext citations
        (losing any structure provided by the BibTeX), and these are then
        handed over to the matching method for plaintext citations
        (``plaintext.get_cited_DOIs``), which relies on the CrossRef API.
        This is the best method found so far, although it can be quite
        frustrating. Let me know if you have anything better!

    :param bibtex: Either the path to a BibTeX file or the content of a \
            BibTeX file.
    :returns: The result of ``plaintext.get_cited_DOIs``: a dict of \
            cleaned plaintext citations and their associated DOI.
    """
    # Flatten the BibTeX entries, then reuse the plaintext citations matcher.
    return plaintext.get_cited_DOIs(get_plaintext_citations(bibtex))