Browse Source

Add a citations fetcher for bibtex files

Phyks (Lucas Verney) 3 years ago
parent
commit
168e37f247
3 changed files with 82 additions and 7 deletions
  1. 7
    7
      libbmc/citations/bbl.py
  2. 74
    0
      libbmc/citations/bibtex.py
  3. 1
    0
      requirements.txt

+ 7
- 7
libbmc/citations/bbl.py View File

@@ -7,7 +7,7 @@ import re
7 7
 import subprocess
8 8
 
9 9
 from libbmc import tools
10
-from libbmc.citations import bbl
10
+from libbmc.citations import plaintext
11 11
 
12 12
 
13 13
 # Regex to match bibitems
@@ -18,7 +18,7 @@ ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
18 18
 
19 19
 def bibitem_as_plaintext(bibitem):
20 20
     """
21
-    Return a plaintext representation of the bibitem from the ``.bbl`` file.
21
+    Return a plaintext representation of a bibitem from the ``.bbl`` file.
22 22
 
23 23
     .. note::
24 24
 
@@ -67,16 +67,16 @@ def get_plaintext_citations(bbl):
67 67
     return cleaned_bbl
68 68
 
69 69
 
70
-def get_cited_DOIs(bbl_input):
70
+def get_cited_DOIs(bbl):
71 71
     """
72
-    Get the DOIs of the papers cited in this .bbl file.
72
+    Get the DOIs of the papers cited in a .bbl file.
73 73
 
74
-    :param bbl_input: Either the path to a .bbl file or the content \
74
+    :param bbl: Either the path to a .bbl file or the content \
75 75
             of a .bbl file.
76 76
 
77 77
     :returns: A dict of cleaned plaintext citations and their associated DOI.
78 78
     """
79 79
     # Get the plaintext citations from the bbl file
80
-    plaintext_citations = get_plaintext_citations(bbl_input)
80
+    plaintext_citations = get_plaintext_citations(bbl)
81 81
     # Use the plaintext citations parser on these citations
82
-    return bbl.get_cited_DOIs(plaintext_citations)
82
+    return plaintext.get_cited_DOIs(plaintext_citations)

+ 74
- 0
libbmc/citations/bibtex.py View File

@@ -0,0 +1,74 @@
1
+"""
2
+This file contains all the functions to extract DOIs of citations from
3
+BibTeX files.
4
+"""
5
+import bibtexparser
6
+import os
7
+
8
+from bibtexparser.bparser import BibTexParser
9
+from bibtexparser.customization import convert_to_unicode
10
+
11
+from libbmc import tools
12
+from libbmc.citations import plaintext
13
+
14
+
15
+def bibentry_as_plaintext(bibentry):
16
+    """
17
+    Return a plaintext representation of a bibentry from a BibTeX file.
18
+
19
+    .. note::
20
+
21
+        This plaintext representation can be super ugly, contain URLs and so \
22
+        on.
23
+
24
+    :param bibentry: A bibentry as parsed by ``bibtexparser``.
25
+    :returns: A cleaned plaintext citation from the bibentry.
26
+    """
27
+    # Just flatten the bibentry
28
+    return tools.clean_whitespaces(" ".join([bibentry[k] for k in bibentry]))
29
+
30
+
31
+def get_plaintext_citations(bibtex):
32
+    """
33
+    Parse a BibTeX file to get a clean list of plaintext citations.
34
+
35
+    :param bibtex: Either the path to the BibTeX file or the content of a \
36
+            BibTeX file.
37
+    :returns:  A list of cleaned plaintext citations.
38
+    """
39
+    parser = BibTexParser()
40
+    parser.customization = convert_to_unicode
41
+    # Load the BibTeX
42
+    if os.path.isfile(bibtex):
43
+        with open(bibtex) as fh:
44
+            bib_database = bibtexparser.load(fh, parser=parser)
45
+    else:
46
+        bib_database = bibtexparser.loads(bibtex, parser=parser)
47
+    # Convert bibentries to plaintext
48
+    bibentries = [bibentry_as_plaintext(bibentry)
49
+                  for bibentry in bib_database.entries]
50
+    # Return them
51
+    return bibentries
52
+
53
+
54
+def get_cited_DOIs(bibtex):
55
+    """
56
+    Get the DOIs of the papers cited in a BibTeX file.
57
+
58
+    .. note::
59
+
60
+        For now, this function is actually flattening the BibTeX file \
61
+                (losing any structure provided by the BibTeX) and calling \
62
+                the matching method for plaintext citations, relying on \
63
+                CrossRef API. This is the best method I have found so far, \
64
+                although it can be quite frustrating. Let me know if you have \
65
+                anything better!
66
+
67
+    :param bibtex: Either the path to a BibTeX file or the content of a \
68
+            BibTeX file.
69
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
70
+    """
71
+    # Get the plaintext citations from the BibTeX file
72
+    plaintext_citations = get_plaintext_citations(bibtex)
73
+    # Use the plaintext citations parser on these citations
74
+    return plaintext.get_cited_DOIs(plaintext_citations)

+ 1
- 0
requirements.txt View File

@@ -1,3 +1,4 @@
1 1
 arxiv2bib>=1.0.7
2
+bibtexparser>=0.6.2
2 3
 isbnlib>=3.5.7
3 4
 requests>=2.9.1