407 lines
9.7 KiB
Python
407 lines
9.7 KiB
Python
"""
|
|
This file contains all the arXiv-related functions.
|
|
"""
|
|
import arxiv2bib
|
|
import bibtexparser
|
|
import io
|
|
import re
|
|
import requests
|
|
import tarfile
|
|
import xml.etree.ElementTree
|
|
|
|
from urllib.error import HTTPError
|
|
from requests.exceptions import RequestException
|
|
|
|
|
|
from libbmc import tools
|
|
from libbmc.citations import bbl
|
|
|
|
|
|
# New-style identifiers: four digits, a dot, four or five digits and an
# optional version suffix (e.g. ``1401.2910`` or ``1401.2910v1``).
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"

# Old-style identifiers: ``<category>/<digits>`` (e.g. ``math.GT/0309136``).
# Every category goes through ``re.escape`` so that the dot separating the
# archive from the subject class only matches a literal dot.
# NOTE(review): identifiers made of a bare archive name without a subject
# class (other than the ones listed below) and old-style identifiers carrying
# a version suffix are not matched by this pattern -- confirm whether they
# should be.
ARXIV_IDENTIFIER_BEFORE_2007 = (
    r"(" +
    "|".join(re.escape(category) for category in [
        "astro-ph.GA",
        "astro-ph.CO",
        "astro-ph.EP",
        "astro-ph.HE",
        "astro-ph.IM",
        "astro-ph.SR",
        "cond-mat.dis-nn",
        "cond-mat.mtrl-sci",
        "cond-mat.mes-hall",
        "cond-mat.other",
        "cond-mat.quant-gas",
        "cond-mat.soft",
        "cond-mat.stat-mech",
        "cond-mat.str-el",
        "cond-mat.supr-con",
        "gr-qc",
        "hep-ex",
        "hep-lat",
        "hep-ph",
        "hep-th",
        "math-ph",
        "nlin.AO",
        "nlin.CG",
        "nlin.CD",
        "nlin.SI",
        "nlin.PS",
        "nucl-ex",
        "nucl-th",
        "physics.acc-ph",
        "physics.ao-ph",
        "physics.atom-ph",
        "physics.atm-clus",
        "physics.bio-ph",
        "physics.chem-ph",
        "physics.class-ph",
        "physics.comp-ph",
        "physics.data-an",
        "physics.flu-dyn",
        "physics.gen-ph",
        "physics.geo-ph",
        "physics.hist-ph",
        "physics.ins-det",
        "physics.med-ph",
        "physics.optics",
        "physics.ed-ph",
        "physics.soc-ph",
        "physics.plasm-ph",
        "physics.pop-ph",
        "physics.space-ph",
        "physics.quant-ph",
        "math.AG",
        "math.AT",
        "math.AP",
        "math.CT",
        "math.CA",
        "math.CO",
        "math.AC",
        "math.CV",
        "math.DG",
        "math.DS",
        "math.FA",
        "math.GM",
        "math.GN",
        "math.GT",
        "math.GR",
        "math.HO",
        "math.IT",
        "math.KT",
        "math.LO",
        "math.MP",
        "math.MG",
        "math.NT",
        "math.NA",
        "math.OA",
        "math.OC",
        "math.PR",
        "math.QA",
        "math.RT",
        "math.RA",
        "math.SP",
        "math.ST",
        "math.SG",
        "cs.AI",
        "cs.CL",
        "cs.CC",
        "cs.CE",
        "cs.CG",
        "cs.GT",
        "cs.CV",
        "cs.CY",
        "cs.CR",
        "cs.DS",
        "cs.DB",
        "cs.DL",
        "cs.DM",
        "cs.DC",
        "cs.ET",
        "cs.FL",
        "cs.GL",
        "cs.GR",
        "cs.AR",
        "cs.HC",
        "cs.IR",
        "cs.IT",
        "cs.LG",
        "cs.LO",
        "cs.MS",
        "cs.MA",
        "cs.MM",
        "cs.NI",
        "cs.NE",
        "cs.NA",
        "cs.OS",
        "cs.OH",
        "cs.PF",
        "cs.PL",
        "cs.RO",
        "cs.SI",
        "cs.SE",
        "cs.SD",
        "cs.SC",
        "cs.SY",
        "q-bio.BM",
        "q-bio.CB",
        "q-bio.GN",
        "q-bio.MN",
        "q-bio.NC",
        "q-bio.OT",
        "q-bio.PE",
        "q-bio.QM",
        "q-bio.SC",
        "q-bio.TO",
        "q-fin.CP",
        "q-fin.EC",
        "q-fin.GN",
        "q-fin.MF",
        "q-fin.PM",
        "q-fin.PR",
        "q-fin.RM",
        "q-fin.ST",
        "q-fin.TR",
        "stat.AP",
        "stat.CO",
        "stat.ML",
        "stat.ME",
        "stat.OT",
        "stat.TH",
    ]) +
    r")/\d+"
)

# Matches either identifier flavour, case-insensitively. The pattern has four
# capturing groups (whole new-style id, its version suffix, whole old-style
# id, its category), so use ``match.group(0)`` to get the full identifier.
REGEX = re.compile(
    "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" +
    ARXIV_IDENTIFIER_BEFORE_2007 + ")",
    re.IGNORECASE)
|
|
|
|
# Template of the arXiv abstract page URL for an entry; the ``{arxiv_id}``
# placeholder is filled in with ``str.format``. This URL is sometimes used as
# an identifier for the entry.
ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"
# Template of the e-print URL used to download the sources of an entry; the
# ``{arxiv_id}`` placeholder is filled in with ``str.format``.
ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"
|
|
|
|
|
|
def get_latest_version(arxiv_id):
    """
    Find the latest version of a given arXiv eprint.

    :param arxiv_id: The arXiv ID to query.
    :returns: The latest version on eprint as a string, or ``None`` if no \
            BibTeX could be fetched or it has no ``eprint`` field.
    """
    # Trick: strip the version from the arXiv id, to query updated BibTeX for
    # the preprint and not the specific version
    arxiv_preprint_id = strip_version(arxiv_id)
    raw_bibtex = get_bibtex(arxiv_preprint_id)
    if raw_bibtex is None:
        # Nothing could be fetched for this id, so there is nothing to parse
        return None

    parsed_bibtex = bibtexparser.loads(raw_bibtex)
    # ``entries_dict`` is a dict, not an iterator: take its first value (the
    # entry itself) rather than calling ``next`` on the dict.
    entry = next(iter(parsed_bibtex.entries_dict.values()), None)
    if entry is None:
        return None
    return entry.get("eprint")
|
|
|
|
|
|
def strip_version(arxiv_id):
    """
    Drop a trailing version marker (``v`` followed by digits) from an arXiv id.

    :param arxiv_id: The arXiv ID to strip.
    :returns: The arXiv ID without its trailing version suffix. IDs without \
            such a suffix are returned unchanged.
    """
    # ``\Z`` anchors at the very end, so only a trailing suffix is removed
    version_suffix = re.compile(r"v\d+\Z")
    return version_suffix.sub('', arxiv_id)
|
|
|
|
|
|
def is_valid(arxiv_id):
    """
    Tell whether a string is, as a whole, a valid arXiv ID.

    :param arxiv_id: The arXiv ID to be checked.
    :returns: ``True`` if ``REGEX`` matches at the start of the string and \
            the match spans the whole string, ``False`` otherwise.
    """
    found = REGEX.match(arxiv_id)
    if found is None:
        return False
    # A match covering only a prefix of the string does not count
    return found.group(0) == arxiv_id
|
|
|
|
|
|
def get_bibtex(arxiv_id):
    """
    Fetch the BibTeX entry matching a given arXiv ID.

    .. note::

        Relies on the https://pypi.python.org/pypi/arxiv2bib/ module.

    :param arxiv_id: The canonical arXiv id to get BibTeX from.
    :returns: A BibTeX string, or ``None`` if the lookup failed.
    """
    try:
        references = arxiv2bib.arxiv2bib([arxiv_id])
    except HTTPError:
        # A failed HTTP query is handled like an empty result
        return None

    for reference in references:
        # Skip error placeholders and return the first real reference
        if not isinstance(reference, arxiv2bib.ReferenceErrorInfo):
            return reference.bibtex()
    # Only errors (or nothing at all) came back
    return None
|
|
|
|
|
|
def extract_from_text(text):
    """
    Extract arXiv IDs from a text.

    :param text: The text to extract arXiv IDs from.
    :returns: A list of matching arXiv IDs (as strings), passed through \
            ``tools.remove_duplicates``.
    """
    # REGEX contains capturing groups, so ``findall`` would yield tuples of
    # groups instead of the identifiers; take the whole match explicitly.
    arxiv_ids = [match.group(0) for match in REGEX.finditer(text)]
    return tools.remove_duplicates(arxiv_ids)
|
|
|
|
|
|
def to_URL(arxiv_ids):
    """
    Build arXiv URLs from arXiv IDs, using the ``ARXIV_URL`` template.

    :param arxiv_ids: Either a list of arXiv IDs or a single arXiv ID.
    :returns: A list of URLs if a list was given, a single URL otherwise.
    """
    if not isinstance(arxiv_ids, list):
        # Single identifier: return a single URL
        return ARXIV_URL.format(arxiv_id=arxiv_ids)
    return [ARXIV_URL.format(arxiv_id=single_id) for single_id in arxiv_ids]
|
|
|
|
|
|
def to_canonical(urls):
    """
    Extract the arXiv IDs found in arXiv URLs.

    :param urls: Either a list of arXiv URLs or a single arXiv URL.
    :returns: For a single URL, the result of ``extract_from_text`` on it. \
            For a list, a list holding that result for each URL, in order.
    """
    if not isinstance(urls, list):
        # Single URL: hand it over directly
        return extract_from_text(urls)
    return list(map(extract_from_text, urls))
|
|
|
|
|
|
def from_DOI(doi):
    """
    Get the arXiv eprint id for a given DOI.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found, if the request \
            failed or if the response could not be parsed.
    """
    try:
        response = requests.get("http://export.arxiv.org/api/query",
                                params={
                                    "search_query": "doi:%s" % (doi,),
                                    "max_results": 1
                                })
        response.raise_for_status()
    except RequestException:
        return None

    try:
        feed = xml.etree.ElementTree.fromstring(response.content)
    except xml.etree.ElementTree.ParseError:
        # Malformed answer from the API: behave as if nothing was found
        return None

    for entry in feed.iter("{http://www.w3.org/2005/Atom}entry"):
        id_node = entry.find("{http://www.w3.org/2005/Atom}id")
        if id_node is None or not id_node.text:
            continue
        entry_url = id_node.text.strip()
        # The id is a full arXiv URL. Keep everything after "/abs/" so that
        # old-style identifiers keep their category
        # (".../abs/hep-th/9901001v1" -> "hep-th/9901001v1").
        _, separator, identifier = entry_url.partition("/abs/")
        if separator:
            return identifier
        # Unexpected URL shape: fall back to the last URL component
        return entry_url.split("/")[-1]
    return None
|
|
|
|
|
|
def to_DOI(arxiv_id):
    """
    Look up the DOI associated with a given arXiv eprint.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param arxiv_id: The arXiv eprint id.
    :returns: The DOI if any, or ``None``.
    """
    query = {
        "id_list": arxiv_id,
        "max_results": 1
    }
    try:
        response = requests.get("http://export.arxiv.org/api/query",
                                params=query)
        response.raise_for_status()
    except RequestException:
        return None

    feed = xml.etree.ElementTree.fromstring(response.content)
    # Lazily look up the DOI element of each entry of the feed
    doi_nodes = (
        entry.find("{http://arxiv.org/schemas/atom}doi")
        for entry in feed.iter("{http://www.w3.org/2005/Atom}entry")
    )
    # Text of the first DOI element found, or None if no entry has one
    return next((node.text for node in doi_nodes if node is not None), None)
|
|
|
|
|
|
def get_sources(arxiv_id):
    """
    Fetch the sources of a given preprint from arXiv.

    .. note::

        Bulk download of sources from arXiv is not permitted by their API. \
                You should have a look at http://arxiv.org/help/bulk_data_s3.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
            a canonical form.
    :returns: A ``TarFile`` object of the sources of the arXiv preprint, or \
            ``None`` if the download failed or the payload could not be \
            opened as a tar archive.
    """
    try:
        response = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
        response.raise_for_status()
        # Open the archive from memory
        in_memory_archive = io.BytesIO(response.content)
        sources = tarfile.open(fileobj=in_memory_archive)
    except (RequestException, AssertionError, tarfile.TarError):
        return None
    return sources
|
|
|
|
|
|
def get_bbl(arxiv_id):
    """
    Get the .bbl files (if any) of a given preprint.

    .. note::

        Bulk download of sources from arXiv is not permitted by their API. \
                You should have a look at http://arxiv.org/help/bulk_data_s3.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
            a canonical form.
    :returns: A list of the full text of the ``.bbl`` files (empty if there \
            are none), or ``None`` if the sources could not be fetched.
    """
    sources = get_sources(arxiv_id)
    if sources is None:
        # Download failed or the payload was not a tar archive
        return None

    bbl_files = []
    # Use the archive as a context manager so that it is closed once read
    with sources:
        for member in sources.getmembers():
            if not member.name.endswith(".bbl"):
                continue
            member_file = sources.extractfile(member)
            if member_file is None:
                # Not a regular file (e.g. a directory named "*.bbl")
                continue
            bbl_files.append(member_file.read().decode(tarfile.ENCODING))
    return bbl_files
|
|
|
|
|
|
def get_citations(arxiv_id):
    """
    Get the DOIs cited by a given preprint.

    .. note::

        Bulk download of sources from arXiv is not permitted by their API. \
                You should have a look at http://arxiv.org/help/bulk_data_s3.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
            a canonical form.
    :returns: A dict of cleaned plaintext citations and their associated \
            DOI. The dict is empty if no ``.bbl`` file is available.
    """
    dois = {}
    # Get the list of bbl files for this preprint. ``get_bbl`` is documented
    # as possibly returning ``None``; treat that like "no bbl file".
    bbl_files = get_bbl(arxiv_id)
    if not bbl_files:
        return dois
    for bbl_file in bbl_files:
        # Fetch the cited DOIs for each of the bbl files
        dois.update(bbl.get_cited_DOIs(bbl_file))
    return dois
|