Add a citations fetcher for bibtex files
This commit is contained in:
parent
0d17254f6c
commit
168e37f247
@ -7,7 +7,7 @@ import re
|
||||
import subprocess
|
||||
|
||||
from libbmc import tools
|
||||
from libbmc.citations import bbl
|
||||
from libbmc.citations import plaintext
|
||||
|
||||
|
||||
# Regex to match bibitems
|
||||
@ -18,7 +18,7 @@ ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
|
||||
|
||||
def bibitem_as_plaintext(bibitem):
|
||||
"""
|
||||
Return a plaintext representation of the bibitem from the ``.bbl`` file.
|
||||
Return a plaintext representation of a bibitem from the ``.bbl`` file.
|
||||
|
||||
.. note::
|
||||
|
||||
@ -67,16 +67,16 @@ def get_plaintext_citations(bbl):
|
||||
return cleaned_bbl
|
||||
|
||||
|
||||
def get_cited_DOIs(bbl_input):
|
||||
def get_cited_DOIs(bbl):
|
||||
"""
|
||||
Get the DOIs of the papers cited in this .bbl file.
|
||||
Get the DOIs of the papers cited in a .bbl file.
|
||||
|
||||
:param bbl_input: Either the path to a .bbl file or the content \
|
||||
:param bbl: Either the path to a .bbl file or the content \
|
||||
of a .bbl file.
|
||||
|
||||
:returns: A dict of cleaned plaintext citations and their associated DOI.
|
||||
"""
|
||||
# Get the plaintext citations from the bbl file
|
||||
plaintext_citations = get_plaintext_citations(bbl_input)
|
||||
plaintext_citations = get_plaintext_citations(bbl)
|
||||
# Use the plaintext citations parser on these citations
|
||||
return bbl.get_cited_DOIs(plaintext_citations)
|
||||
return plaintext.get_cited_DOIs(plaintext_citations)
|
||||
|
74
libbmc/citations/bibtex.py
Normal file
74
libbmc/citations/bibtex.py
Normal file
@ -0,0 +1,74 @@
|
||||
"""
|
||||
This file contains all the functions to extract DOIs of citations from
|
||||
BibTeX files.
|
||||
"""
|
||||
import bibtexparser
|
||||
import os
|
||||
|
||||
from bibtexparser.bparser import BibTexParser
|
||||
from bibtexparser.customization import convert_to_unicode
|
||||
|
||||
from libbmc import tools
|
||||
from libbmc.citations import plaintext
|
||||
|
||||
|
||||
def bibentry_as_plaintext(bibentry):
    """
    Build a plaintext citation out of a single BibTeX entry.

    Every field value of the entry is concatenated, separated by a single
    space, and the result is passed through ``tools.clean_whitespaces``.

    .. note::

        No field is filtered out, so the resulting string can be quite
        ugly (it may contain URLs, entry keys and so on).

    :param bibentry: A bibentry as parsed by ``bibtexparser``.
    :returns: A cleaned plaintext citation from the bibentry.
    """
    # Flatten the entry: keep only the field values, in iteration order.
    flattened = " ".join(bibentry[field] for field in bibentry)
    return tools.clean_whitespaces(flattened)
|
||||
|
||||
|
||||
def get_plaintext_citations(bibtex):
    """
    Turn a BibTeX source into a list of plaintext citations.

    :param bibtex: Either the path to the BibTeX file or the content of a
        BibTeX file.
    :returns: A list of cleaned plaintext citations, one per entry of the
        parsed BibTeX database.
    """
    # Parser configured with the ``convert_to_unicode`` customization.
    bibtex_parser = BibTexParser()
    bibtex_parser.customization = convert_to_unicode

    # The argument is treated as a path only if it names an existing file;
    # anything else is parsed as raw BibTeX content.
    if not os.path.isfile(bibtex):
        database = bibtexparser.loads(bibtex, parser=bibtex_parser)
    else:
        with open(bibtex) as bibtex_file:
            database = bibtexparser.load(bibtex_file, parser=bibtex_parser)

    # One flattened plaintext citation per parsed entry.
    return [bibentry_as_plaintext(entry) for entry in database.entries]
|
||||
|
||||
|
||||
def get_cited_DOIs(bibtex):
    """
    Get the DOIs of the papers cited in a BibTeX file.

    .. note::

        For now, this function flattens the BibTeX entries (losing any
        structure provided by the BibTeX) and hands them over to the
        matching method for plaintext citations, which relies on the
        CrossRef API. This is the best method found so far, although it
        can be quite frustrating. Let me know if you have anything better!

    :param bibtex: Either the path to a BibTeX file or the content of a
        BibTeX file.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    # Flatten the BibTeX entries into plaintext citations, then delegate the
    # DOI lookup to the plaintext citations parser.
    return plaintext.get_cited_DOIs(get_plaintext_citations(bibtex))
|
@ -1,3 +1,4 @@
|
||||
arxiv2bib>=1.0.7
|
||||
bibtexparser>=0.6.2
|
||||
isbnlib>=3.5.7
|
||||
requests>=2.9.1
|
||||
|
Loading…
Reference in New Issue
Block a user