Add a citations fetcher for bibtex files

This commit is contained in:
Lucas Verney 2015-12-28 00:42:27 +01:00
parent 0d17254f6c
commit 168e37f247
3 changed files with 82 additions and 7 deletions

View File

@ -7,7 +7,7 @@ import re
import subprocess import subprocess
from libbmc import tools from libbmc import tools
from libbmc.citations import bbl from libbmc.citations import plaintext
# Regex to match bibitems # Regex to match bibitems
@ -18,7 +18,7 @@ ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
def bibitem_as_plaintext(bibitem): def bibitem_as_plaintext(bibitem):
""" """
Return a plaintext representation of the bibitem from the ``.bbl`` file. Return a plaintext representation of a bibitem from the ``.bbl`` file.
.. note:: .. note::
@ -67,16 +67,16 @@ def get_plaintext_citations(bbl):
return cleaned_bbl return cleaned_bbl
def get_cited_DOIs(bbl_input): def get_cited_DOIs(bbl):
""" """
Get the DOIs of the papers cited in this .bbl file. Get the DOIs of the papers cited in a .bbl file.
:param bbl_input: Either the path to a .bbl file or the content \ :param bbl: Either the path to a .bbl file or the content \
of a .bbl file. of a .bbl file.
:returns: A dict of cleaned plaintext citations and their associated DOI. :returns: A dict of cleaned plaintext citations and their associated DOI.
""" """
# Get the plaintext citations from the bbl file # Get the plaintext citations from the bbl file
plaintext_citations = get_plaintext_citations(bbl_input) plaintext_citations = get_plaintext_citations(bbl)
# Use the plaintext citations parser on these citations # Use the plaintext citations parser on these citations
return bbl.get_cited_DOIs(plaintext_citations) return plaintext.get_cited_DOIs(plaintext_citations)

View File

@ -0,0 +1,74 @@
"""
This file contains all the functions to extract DOIs of citations from
BibTeX files.
"""
import bibtexparser
import os
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode
from libbmc import tools
from libbmc.citations import plaintext
def bibentry_as_plaintext(bibentry):
    """
    Flatten a single BibTeX entry into one plaintext string.

    .. note::

        Every field value of the entry is kept, so the result can be \
        super ugly, contain URLs and so on.

    :param bibentry: A bibentry as parsed by ``bibtexparser``.
    :returns: The field values of the entry, joined by single spaces and \
            passed through ``tools.clean_whitespaces``.
    """
    # No field is filtered out: the values are concatenated in the
    # iteration order of the entry.
    flattened = " ".join(bibentry.values())
    return tools.clean_whitespaces(flattened)
def get_plaintext_citations(bibtex):
    """
    Turn the entries of a BibTeX source into plaintext citations.

    :param bibtex: Either the path to the BibTeX file or the content of a \
            BibTeX file.
    :returns: A list with one plaintext citation per parsed entry, each \
            built by ``bibentry_as_plaintext``.
    """
    # The parser runs bibtexparser's ``convert_to_unicode`` customization
    # on the records it reads.
    bibtex_parser = BibTexParser()
    bibtex_parser.customization = convert_to_unicode

    if not os.path.isfile(bibtex):
        # Not an existing file: treat the argument as raw BibTeX content.
        database = bibtexparser.loads(bibtex, parser=bibtex_parser)
    else:
        # The argument is a path to an existing file: parse from the file.
        with open(bibtex) as handle:
            database = bibtexparser.load(handle, parser=bibtex_parser)

    # One flattened string per entry, in the order of ``database.entries``.
    return list(map(bibentry_as_plaintext, database.entries))
def get_cited_DOIs(bibtex):
    """
    Look up the DOIs of the papers listed in a BibTeX file.

    .. note::

        For now, the BibTeX entries are simply flattened to plaintext \
        (losing any structure provided by the BibTeX) and handed to \
        the matching method for plaintext citations, which relies on \
        the CrossRef API. This is the best method found so far, \
        although it can be quite frustrating. Let me know if you have \
        anything better!

    :param bibtex: Either the path to a BibTeX file or the content of a \
            BibTeX file.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    # Flatten the BibTeX entries to plaintext citations, then delegate the
    # DOI matching to the plaintext citations parser.
    return plaintext.get_cited_DOIs(get_plaintext_citations(bibtex))

View File

@ -1,3 +1,4 @@
arxiv2bib>=1.0.7 arxiv2bib>=1.0.7
bibtexparser>=0.6.2
isbnlib>=3.5.7 isbnlib>=3.5.7
requests>=2.9.1 requests>=2.9.1