commit 9ab4141d6056d2727016b769ceddde3781bdb050
Author: Phyks (Lucas Verney)
Date:   Wed Dec 23 19:29:10 2015 +0100

    First commit

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..cee79a4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "opendetex"]
+	path = opendetex
+	url = https://github.com/Phyks/opendetex
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9fc9865
--- /dev/null
+++ b/README.md
@@ -0,0 +1,16 @@
+Metadata for arXiv
+==================
+
+## Installation
+
+For building `opendetex` (which is a necessary dependency), you will need
+`gcc`, `flex` and `make`.
+
+* Clone this repository: `git clone https://github.com/Phyks/arxiv_metadata`.
+* Init submodules (`opendetex`): `git submodule init; git submodule update`.
+* Build `opendetex`: `cd opendetex; make`.
+* You are ready to go.
+
+## Usage
+
+`./main.py some_file.bbl` to get a list of DOIs associated to each `\bibitem`.
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..94914f4
--- /dev/null
+++ b/main.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""
+Extract the DOIs of the references listed in a LaTeX bbl file.
+"""
+import math
+import os
+import re
+import requests
+import subprocess
+import sys
+
+
+# Number of citations sent to Crossref in a single request.
+CROSSREF_BATCH_SIZE = 10
+# Seconds to wait for Crossref before giving up on a request.
+CROSSREF_TIMEOUT = 30
+
+
+def oa_from_doi(doi):
+    """
+    Get an OA version for a given DOI.
+
+    Not implemented yet: always returns None.
+    """
+    # http://beta.dissem.in/api/10.1088/1367-2630/17/9/093036
+    pass
+
+
+def clean_bibitem(bibitem):
+    """
+    Return a plaintext representation of the bibitem from the bbl file.
+
+    The LaTeX markup is stripped by piping the bibitem through the
+    `opendetex/delatex` binary located next to this script, then whitespace
+    is collapsed and trailing dots and commas are removed.
+
+    Raises subprocess.CalledProcessError if delatex exits with an error.
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    delatex = os.path.join(script_dir, "opendetex", "delatex")
+    output = subprocess.check_output([delatex, "-s"],
+                                     input=bibitem.encode("utf-8"))
+    output = output.decode("utf-8")
+    return ' '.join(output.strip().rstrip(".,").split())
+
+
+def parse_bbl(bbl_file):
+    """
+    Return the plaintext citations found in the given bbl file.
+
+    The content is split on bibitem commands, each entry is stripped of the
+    closing thebibliography tag and then passed through clean_bibitem.
+    """
+    with open(bbl_file, 'r') as fh:
+        bbl_content = fh.read()
+    # Text before the first bibitem is not a citation, hence the [1:]. The
+    # optional [label] argument of bibitem must be matched too, otherwise
+    # entries carrying one are never split apart.
+    bibitems = re.split(r"\\bibitem(?:\[[^\]]*\])?\{.+?\}", bbl_content)[1:]
+    bibitems = [re.sub(r"\\end\{thebibliography}", "", i).strip()
+                for i in bibitems]
+    return [clean_bibitem(bibitem) for bibitem in bibitems]
+
+
+def dois_from_bbl(bbl_file):
+    """
+    Get the DOIs of the papers cited in the given bbl file.
+
+    Citations are posted to the Crossref links API in batches. The result
+    is a dict mapping each citation text returned by Crossref to its DOI,
+    or to None when Crossref did not return a DOI for it.
+
+    Raises requests.exceptions.RequestException if a request fails, times
+    out or gets an HTTP error status.
+    """
+    cleaned_citations = parse_bbl(bbl_file)
+    dois = {}
+    for i in range(math.ceil(len(cleaned_citations) / CROSSREF_BATCH_SIZE)):
+        lower_bound = CROSSREF_BATCH_SIZE * i
+        upper_bound = lower_bound + CROSSREF_BATCH_SIZE
+        r = requests.post("http://search.crossref.org/links",
+                          json=cleaned_citations[lower_bound:upper_bound],
+                          timeout=CROSSREF_TIMEOUT)
+        # Surface HTTP errors here rather than as an obscure JSON failure.
+        r.raise_for_status()
+        for result in r.json()["results"]:
+            dois[result["text"]] = result.get("doi")
+    return dois
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("Usage: " + sys.argv[0] + " BBL_FILE.")
+
+    import pprint
+    pprint.pprint(dois_from_bbl(sys.argv[1]))
diff --git a/opendetex b/opendetex
new file mode 160000
index 0000000..284f592
--- /dev/null
+++ b/opendetex
@@ -0,0 +1 @@
+Subproject commit 284f59211829aab75e8f07423da195bc630146c2
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..576b504
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests>=2.4.2