From 168e37f2474b78fe0aa981294807d95e7fccb1b9 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Mon, 28 Dec 2015 00:42:27 +0100 Subject: [PATCH] Add a citations fetcher for bibtex files --- libbmc/citations/bbl.py | 14 ++++---- libbmc/citations/bibtex.py | 74 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 3 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 libbmc/citations/bibtex.py diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py index 8de79ab..f36509b 100644 --- a/libbmc/citations/bbl.py +++ b/libbmc/citations/bbl.py @@ -7,7 +7,7 @@ import re import subprocess from libbmc import tools -from libbmc.citations import bbl +from libbmc.citations import plaintext # Regex to match bibitems @@ -18,7 +18,7 @@ ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*") def bibitem_as_plaintext(bibitem): """ - Return a plaintext representation of the bibitem from the ``.bbl`` file. + Return a plaintext representation of a bibitem from the ``.bbl`` file. .. note:: @@ -67,16 +67,16 @@ def get_plaintext_citations(bbl): return cleaned_bbl -def get_cited_DOIs(bbl_input): +def get_cited_DOIs(bbl): """ - Get the DOIs of the papers cited in this .bbl file. + Get the DOIs of the papers cited in a .bbl file. - :param bbl_input: Either the path to a .bbl file or the content \ + :param bbl: Either the path to a .bbl file or the content \ of a .bbl file. :returns: A dict of cleaned plaintext citations and their associated DOI. 
""" # Get the plaintext citations from the bbl file - plaintext_citations = get_plaintext_citations(bbl_input) + plaintext_citations = get_plaintext_citations(bbl) # Use the plaintext citations parser on these citations - return bbl.get_cited_DOIs(plaintext_citations) + return plaintext.get_cited_DOIs(plaintext_citations) diff --git a/libbmc/citations/bibtex.py b/libbmc/citations/bibtex.py new file mode 100644 index 0000000..d241141 --- /dev/null +++ b/libbmc/citations/bibtex.py @@ -0,0 +1,74 @@ +""" +This file contains all the functions to extract DOIs of citations from +BibTeX files. +""" +import bibtexparser +import os + +from bibtexparser.bparser import BibTexParser +from bibtexparser.customization import convert_to_unicode + +from libbmc import tools +from libbmc.citations import plaintext + + +def bibentry_as_plaintext(bibentry): + """ + Return a plaintext representation of a bibentry from a BibTeX file. + + .. note:: + + This plaintext representation can be super ugly, contain URLs and so \ + on. + + :param bibentry: A bibentry as parsed by ``bibtexparser``. + :returns: A cleaned plaintext citation from the bibentry. + """ + # Just flatten the bibentry + return tools.clean_whitespaces(" ".join([bibentry[k] for k in bibentry])) + + +def get_plaintext_citations(bibtex): + """ + Parse a BibTeX file to get a clean list of plaintext citations. + + :param bibtex: Either the path to the BibTeX file or the content of a \ + BibTeX file. + :returns: A list of cleaned plaintext citations. 
+ """ + parser = BibTexParser() + parser.customization = convert_to_unicode + # Load the BibTeX + if os.path.isfile(bibtex): + with open(bibtex) as fh: + bib_database = bibtexparser.load(fh, parser=parser) + else: + bib_database = bibtexparser.loads(bibtex, parser=parser) + # Convert bibentries to plaintext + bibentries = [bibentry_as_plaintext(bibentry) + for bibentry in bib_database.entries] + # Return them + return bibentries + + +def get_cited_DOIs(bibtex): + """ + Get the DOIs of the papers cited in a BibTeX file. + + .. note:: + + For now, this function is actually flattening the BibTeX file \ + (losing any structure provided by the BibTeX) and calling \ + the matching method for plaintext citations, relying on \ + CrossRef API. This is the best method I have found so far, \ + although it can be quite frustrating. Let me know if you have \ + anything better! + + :param bibtex: Either the path to a BibTeX file or the content of a \ + BibTeX file. + :returns: A dict of cleaned plaintext citations and their associated DOI. + """ + # Get the plaintext citations from the BibTeX file + plaintext_citations = get_plaintext_citations(bibtex) + # Use the plaintext citations parser on these citations + return plaintext.get_cited_DOIs(plaintext_citations) diff --git a/requirements.txt b/requirements.txt index 0caebe8..91bb5ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ arxiv2bib>=1.0.7 +bibtexparser>=0.6.2 isbnlib>=3.5.7 requests>=2.9.1