First commit
This commit is contained in:
commit
9ab4141d60
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "opendetex"]
|
||||||
|
path = opendetex
|
||||||
|
url = https://github.com/Phyks/opendetex
|
16
README.md
Normal file
16
README.md
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
Metadata for arXiv
|
||||||
|
==================
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
To build `opendetex` (a required dependency), you will need
|
||||||
|
`gcc`, `flex` and `make`.
|
||||||
|
|
||||||
|
* Clone this repository: `git clone https://github.com/Phyks/arxiv_metadata`.
|
||||||
|
* Init submodules (`opendetex`): `git submodule init; git submodule update`.
|
||||||
|
* Build `opendetex`: `cd opendetex; make`.
|
||||||
|
* You are ready to go.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Run `./main.py some_file.bbl` to list the DOI associated with each `\bibitem`.
|
67
main.py
Executable file
67
main.py
Executable file
@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def oa_from_doi(doi):
    """
    Get an OA version for a given DOI.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    # Example of an API URL that could be queried here:
    # http://beta.dissem.in/api/10.1088/1367-2630/17/9/093036
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def clean_bibitem(bibitem):
    """
    Turn one raw bibitem taken from a bbl file into plain text.

    The bibitem is fed, UTF-8 encoded, to the standard input of the
    ``opendetex/delatex`` executable located next to this script (called
    with the ``-s`` flag). Its output is decoded as UTF-8, stripped of
    surrounding whitespace and of trailing "." and "," characters, and every
    run of whitespace is collapsed to a single space.

    ``subprocess.CalledProcessError`` propagates if ``delatex`` exits with a
    non-zero status.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    delatex = here + "/opendetex/delatex"
    raw = subprocess.check_output([delatex, "-s"],
                                  input=bibitem.encode("utf-8"))
    text = raw.decode("utf-8").strip().rstrip(".,")
    # split() + join normalises newlines and repeated blanks in one go.
    return ' '.join(text.split())
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bbl(bbl_file):
    r"""
    Return the plaintext citations contained in a bbl file.

    The file content is split on every ``\bibitem`` marker, whether it is
    written ``\bibitem{key}`` or carries the optional label argument
    (``\bibitem[label]{key}``). Whatever precedes the first marker is
    discarded. Any ``\end{thebibliography}`` is removed from each entry,
    which is then stripped and passed through ``clean_bibitem``.

    ``bbl_file`` is the path of the bbl file to read. The result is a list
    of cleaned citation strings, in the order they appear in the file.
    """
    with open(bbl_file, 'r') as fh:
        bbl_content = fh.read()
    # The optional "[label]" group (and any blanks around it) has to be
    # consumed by the separator as well; otherwise labelled entries are
    # never recognised and no citation is returned at all.
    bibitems = re.split(r"\\bibitem\s*(?:\[[^\]]*\])?\s*\{.+?\}",
                        bbl_content)[1:]
    return [
        clean_bibitem(re.sub(r"\\end\{thebibliography}", "", item).strip())
        for item in bibitems
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def dois_from_bbl(bbl_file):
    """
    Get the DOIs of the papers cited in the given bbl file.

    The citations returned by ``parse_bbl`` are posted as JSON to
    http://search.crossref.org/links, ten at a time.

    Returns a dict mapping the ``text`` field of each result sent back by
    the API to the matching ``doi`` field, or to ``None`` when the result
    has no ``doi``.

    Raises ``requests.HTTPError`` if the API answers with an error status.
    """
    cleaned_citations = parse_bbl(bbl_file)
    dois = {}
    # Step through the citations in slices of 10; slicing past the end of
    # the list is safe, so the last (shorter) batch needs no special case.
    for lower_bound in range(0, len(cleaned_citations), 10):
        r = requests.post(
            "http://search.crossref.org/links",
            json=cleaned_citations[lower_bound:lower_bound + 10])
        # Surface HTTP failures explicitly rather than failing later on a
        # body that cannot be decoded or lacks the "results" key.
        r.raise_for_status()
        for result in r.json()["results"]:
            dois[result["text"]] = result.get("doi")
    return dois
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import pprint

    # The bbl file to process is expected as the first argument.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} BBL_FILE.".format(sys.argv[0]))

    pprint.pprint(dois_from_bbl(sys.argv[1]))
|
1
opendetex
Submodule
1
opendetex
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 284f59211829aab75e8f07423da195bc630146c2
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
requests>=2.4.2
|
Loading…
Reference in New Issue
Block a user