Browse Source

References extraction using CERMINE

Phyks (Lucas Verney) 3 years ago
parent
commit
8a905a9776
1 changed file with 17 additions and 3 deletions
  1. 17
    3
      libbmc/citations/pdf.py

+ 17
- 3
libbmc/citations/pdf.py View File

@@ -11,6 +11,7 @@ import xml.etree.ElementTree as ET
11 11
 
12 12
 from requests.exceptions import RequestException
13 13
 
14
+from libbmc import tools
14 15
 from libbmc.citations import plaintext
15 16
 
16 17
 
@@ -79,7 +80,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
79 80
                 "java",
80 81
                 "-cp", local,
81 82
                 "pl.edu.icm.cermine.PdfNLMContentExtractor",
82
-                "-path", pdf_file])
83
+                "-path", pdf_file]).decode("utf-8")
83 84
     except (RequestException,
84 85
             subprocess.CalledProcessError,
85 86
             FileNotFoundError):
@@ -123,8 +124,21 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
123 124
             the default location (``libbmc/external/cermine.jar``).
124 125
     :returns: A dict of cleaned plaintext citations and their associated DOI.
125 126
     """
126
-    # TODO
127
-    pass
127
+    # TODO:
128
+    #    * Do not convert to plain text, but use the extra metadata from
129
+    #      CERMINE
130
+    # Call CERMINE on the PDF file
131
+    cermine_output = cermine(pdf_file, force_API, override_local)
132
+    # Parse the resulting XML
133
+    root = ET.fromstring(cermine_output)
134
+    plaintext_references = [
135
+        # Remove extra whitespaces
136
+        tools.clean_whitespaces(
137
+            # Convert XML element to string, discarding any leading "[n]"
138
+            ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
139
+        for e in root.iter("mixed-citation")]
140
+    # Call the plaintext methods to fetch DOIs
141
+    return plaintext.get_cited_DOIs(plaintext_references)
128 142
 
129 143
 
130 144
 def grobid(pdf_file):