First commit

2015-12-23 19:29:10 +01:00 · 2015-12-23 19:29:10 +01:00 · 9ab4141d60
commit 9ab4141d60
5 changed files with 88 additions and 0 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
 [submodule "opendetex"]
 	path = opendetex
 	url = https://github.com/Phyks/opendetex
--- a/README.md
+++ b/README.md
@ -0,0 +1,16 @@
 Metadata for arXiv
 ==================
 ## Installation
 For building `opendetex` (which is a necessary dependency), you will need
 `gcc`, `flex` and `make`.
 * Clone this repository: `git clone https://github.com/Phyks/arxiv_metadata`.
 * Init submodules (`opendetex`): `git submodule init; git submodule update`.
 * Build `opendetex`: `cd opendetex; make`.
 * You are ready to go.
 ## Usage
 Run `./main.py some_file.bbl` to get the DOI associated with each `\bibitem`.
--- a/main.py
+++ b/main.py
@ -0,0 +1,67 @@
 #!/usr/bin/env python3
 import math
 import os
 import re
 import requests
 import subprocess
 import sys
 def oa_from_doi(doi):
    """
    Look up an open-access (OA) version of the paper with the given DOI.
    Not implemented yet: this placeholder always returns None.
    """
    # Candidate backend, example query:
    # http://beta.dissem.in/api/10.1088/1367-2630/17/9/093036
    return None
 def clean_bibitem(bibitem):
    """
    Convert one raw bibitem taken from a bbl file into plain text.
    The LaTeX source is piped (UTF-8 encoded) to the ``opendetex/delatex``
    executable located next to this script, invoked with ``-s``. Its output
    is stripped, trailing dots and commas are removed, and every run of
    whitespace is collapsed to a single space.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    delatex_cmd = [here + "/opendetex/delatex", "-s"]
    raw_output = subprocess.check_output(delatex_cmd,
                                         input=bibitem.encode("utf-8"))
    text = raw_output.decode("utf-8").strip().rstrip(".,")
    # split() without arguments splits on any run of whitespace.
    words = text.split()
    return ' '.join(words)
 def parse_bbl(bbl_file):
    """
    Read a bbl file and return its bibitems as a list of plaintext strings.
    The content is split on ``\\bibitem{...}`` markers; each resulting entry
    has ``\\end{thebibliography}`` removed, is stripped, and is then passed
    through clean_bibitem. Entries are returned in file order.
    """
    with open(bbl_file, 'r') as handle:
        raw_bbl = handle.read()
    # Whatever precedes the first \bibitem{...} is not an entry: drop it.
    chunks = re.split(r"\\bibitem\{.+?\}", raw_bbl)[1:]
    entries = []
    for chunk in chunks:
        without_end = re.sub(r"\\end\{thebibliography}", "", chunk)
        entries.append(clean_bibitem(without_end.strip()))
    return entries
 def dois_from_bbl(bbl_file):
    """
    Get the DOIs of the references listed in the given bbl file.
    The plaintext citations returned by parse_bbl are posted as JSON to the
    http://search.crossref.org/links endpoint, at most 10 per request.
    Returns a dict mapping the ``text`` of each result in the response to
    its ``doi``, or to None when the result carries no ``doi`` key.
    Raises requests.HTTPError if the endpoint answers with an error status.
    """
    cleaned_citations = parse_bbl(bbl_file)
    dois = {}
    # Step through the citations 10 at a time; slicing clamps the last batch.
    for start in range(0, len(cleaned_citations), 10):
        r = requests.post("http://search.crossref.org/links",
                          json=cleaned_citations[start:start + 10])
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError
        # when the service rejects the request.
        r.raise_for_status()
        for result in r.json()["results"]:
            dois[result["text"]] = result.get("doi")
    return dois
 if __name__ == "__main__":
    # Script entry point: pretty-print the mapping built by dois_from_bbl for
    # the bbl file given as first command-line argument.
    import pprint
    arguments = sys.argv
    if not len(arguments) >= 2:
        sys.exit("Usage: " + arguments[0] + " BBL_FILE.")
    pprint.pprint(dois_from_bbl(arguments[1]))
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit 284f59211829aab75e8f07423da195bc630146c2
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
 requests>=2.4.2
		`@ -0,0 +1 @@`
							`Subproject commit 284f59211829aab75e8f07423da195bc630146c2`