75 lines
2.4 KiB
Python
75 lines
2.4 KiB
Python
"""
|
|
This file contains all the functions to extract DOIs of citations from
|
|
BibTeX files.
|
|
"""
|
|
import bibtexparser
|
|
import os
|
|
|
|
from bibtexparser.bparser import BibTexParser
|
|
from bibtexparser.customization import convert_to_unicode
|
|
|
|
from libbmc import tools
|
|
from libbmc.citations import plaintext
|
|
|
|
|
|
def bibentry_as_plaintext(bibentry):
    """
    Flatten a single BibTeX entry into one plaintext string.

    Every field value of the entry is concatenated, separated by single
    spaces, and the result is passed through ``tools.clean_whitespaces``.

    .. note::

        No field is filtered out, so this plaintext representation can be
        super ugly, contain URLs and so on.

    :param bibentry: A bibentry as parsed by ``bibtexparser``.
    :returns: A cleaned plaintext citation built from the bibentry.
    """
    # Walk the entry in its own iteration order and pick up each field value
    field_values = (bibentry[field] for field in bibentry)
    flattened = " ".join(field_values)
    return tools.clean_whitespaces(flattened)
|
|
|
|
|
|
def get_plaintext_citations(bibtex):
    """
    Turn a BibTeX source into a list of cleaned plaintext citations.

    The argument is first tested with ``os.path.isfile``: if it names an
    existing file, that file is opened and parsed; otherwise the argument
    itself is parsed as BibTeX content. Each parsed entry is then flattened
    with ``bibentry_as_plaintext``.

    :param bibtex: Either the path to the BibTeX file or the content of a
        BibTeX file.
    :returns: A list of cleaned plaintext citations, one per parsed entry.
    """
    # Parser set up with the ``convert_to_unicode`` customization
    bibtex_parser = BibTexParser()
    bibtex_parser.customization = convert_to_unicode

    if not os.path.isfile(bibtex):
        # Not an existing file: treat the argument as raw BibTeX content
        database = bibtexparser.loads(bibtex, parser=bibtex_parser)
    else:
        with open(bibtex) as bibtex_file:
            database = bibtexparser.load(bibtex_file, parser=bibtex_parser)

    # Flatten every parsed entry into its plaintext form
    return list(map(bibentry_as_plaintext, database.entries))
|
|
|
|
|
|
def get_cited_DOIs(bibtex):
    """
    Get the DOIs of the papers cited in a BibTeX file.

    .. note::

        For now, this function is actually flattening the BibTeX file
        (losing any structure provided by the BibTeX) and calling the
        matching method for plaintext citations, relying on CrossRef API.
        This is the best method I have found so far, although it can be
        quite frustrating. Let me know if you have anything better!

    :param bibtex: Either the path to a BibTeX file or the content of a
        BibTeX file.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    # Flatten the BibTeX entries, then delegate the DOI matching to the
    # plaintext citations module
    return plaintext.get_cited_DOIs(get_plaintext_citations(bibtex))
|