arxiv_metadata/reference_fetcher/arxiv.py

"""
This file contains all the arXiv-specific functions.
"""
import io
import requests
import tarfile
import xml.etree.ElementTree

from . import bbl


def sources_from_arxiv(eprint):
    """
    Fetch the source archive of an arXiv preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a TarFile object opened on the downloaded e-print, which is held
    entirely in memory.
    """
    url = "http://arxiv.org/e-print/%s" % (eprint,)
    response = requests.get(url)
    # tarfile needs a file-like object, so wrap the raw bytes in a buffer.
    return tarfile.open(fileobj=io.BytesIO(response.content))


def bbl_from_arxiv(eprint):
    """
    Get the .bbl files (if any) of a given preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a list with the content of each .bbl file, decoded as text. The
    list is empty when the sources do not contain any .bbl file.
    """
    bbl_files = []
    # Use the archive as a context manager so it is closed once every member
    # has been read.
    with sources_from_arxiv(eprint) as tf:
        for member in tf.getmembers():
            if not member.name.endswith(".bbl"):
                continue
            extracted = tf.extractfile(member)
            # extractfile() returns None for members that are neither regular
            # files nor links (e.g. a directory whose name ends with ".bbl").
            if extracted is None:
                continue
            bbl_files.append(extracted.read().decode(tarfile.ENCODING))
    return bbl_files


def get_cited_dois(eprint):
    """
    Collect the DOIs cited in the .bbl files of a given preprint.

    Params:
        - eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).

    Returns a dict of cleaned plaintext citations and their associated doi.
    """
    cited = {}
    for content in bbl_from_arxiv(eprint):
        # Merge the results of every .bbl file; on duplicate keys the value
        # from the later file wins.
        extracted = bbl.get_dois(content)
        cited.update(extracted)
    return cited


def get_arxiv_eprint_from_doi(doi):
    """
    Get the arXiv eprint id for a given DOI.

    Params:
        - doi is the DOI of the resource to look for.

    Returns the arXiv eprint id, or None if not found.
    """
    r = requests.get("http://export.arxiv.org/api/query",
                     params={
                         "search_query": "doi:%s" % (doi,),
                         "max_results": 1
                     })
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        entry_id = entry.find("{http://www.w3.org/2005/Atom}id")
        # An entry without a usable <id> cannot yield an eprint id; skip it
        # instead of failing on a missing element or empty text.
        if entry_id is None or entry_id.text is None:
            continue
        # The id is an abstract URL; strip the prefix to keep only the
        # eprint identifier.
        return entry_id.text.replace("http://arxiv.org/abs/", "")
    return None


def get_doi(eprint):
    """
    Look up the DOI registered for a given arXiv eprint.

    Params:
        - eprint is the arXiv eprint id.

    Returns the DOI if any, or None.
    """
    query = {
        "id_list": eprint,
        "max_results": 1
    }
    response = requests.get("http://export.arxiv.org/api/query", params=query)
    feed = xml.etree.ElementTree.fromstring(response.content)
    for entry in feed.iter("{http://www.w3.org/2005/Atom}entry"):
        doi_node = entry.find("{http://arxiv.org/schemas/atom}doi")
        if doi_node is None:
            # This entry carries no DOI element; try the next one.
            continue
        return doi_node.text
    return None
Add a function to look for an OA version for a given DOI 2015-12-23 23:03:40 +01:00			`"""`
			`This file contains all the arXiv-specific functions.`
			`"""`
Clean a bit the code 2015-12-23 22:49:14 +01:00			`import io`
			`import requests`
			`import tarfile`
Basic API to put and fetch some papers 2015-12-24 20:34:34 +01:00			`import xml.etree.ElementTree`
Clean a bit the code 2015-12-23 22:49:14 +01:00
Move reference fetching code into a sub-library 2015-12-23 23:46:37 +01:00			`from . import bbl`

Clean a bit the code 2015-12-23 22:49:14 +01:00
			`def sources_from_arxiv(eprint):`
			`"""`
			`Download sources on arXiv for a given preprint.`

			`Params:`
			`- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).`

			`Returns a TarFile object of the sources of the arXiv preprint.`
			`"""`
			`r = requests.get("http://arxiv.org/e-print/%s" % (eprint,))`
			`file_object = io.BytesIO(r.content)`
			`return tarfile.open(fileobj=file_object)`


			`def bbl_from_arxiv(eprint):`
			`"""`
			`Get the .bbl files (if any) of a given preprint.`

			`Params:`
			`- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).`

			`Returns a list of the .bbl files as text (if any) or None.`
			`"""`
			`tf = sources_from_arxiv(eprint)`
			`bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]`
			`bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)`
			`for member in bbl_files]`
			`return bbl_files`


Basic API to put and fetch some papers 2015-12-24 20:34:34 +01:00			`def get_cited_dois(eprint):`
Clean a bit the code 2015-12-23 22:49:14 +01:00			`"""`
			`Get the .bbl files (if any) of a given preprint.`

			`Params:`
			`- eprint is the arXiv id (e.g. 1401.2910 or 1401.2910v1).`

			`Returns a dict of cleaned plaintext citations and their associated doi.`
			`"""`
			`bbl_files = bbl_from_arxiv(eprint)`
			`dois = {}`
			`for bbl_file in bbl_files:`
			`dois.update(bbl.get_dois(bbl_file))`
			`return dois`
Basic API to put and fetch some papers 2015-12-24 20:34:34 +01:00

			`def get_arxiv_eprint_from_doi(doi):`
			`"""`
			`Get the arXiv eprint id for a given DOI.`

			`Params:`
			`- doi is the DOI of the resource to look for.`

			`Returns the arXiv eprint id, or None if not found.`
			`"""`
			`r = requests.get("http://export.arxiv.org/api/query",`
			`params={`
			`"search_query": "doi:%s" % (doi,),`
			`"max_results": 1`
			`})`
			`e = xml.etree.ElementTree.fromstring(r.content)`
			`for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):`
			`id = entry.find("{http://www.w3.org/2005/Atom}id").text`
			`return id.replace("http://arxiv.org/abs/", "")`
			`return None`


			`def get_doi(eprint):`
			`"""`
			`Get the associated DOI for a given arXiv eprint.`

			`Params:`
			`- eprint is the arXiv eprint id.`

			`Returns the DOI if any, or None.`
			`"""`
			`r = requests.get("http://export.arxiv.org/api/query",`
			`params={`
			`"id_list": eprint,`
			`"max_results": 1`
			`})`
			`e = xml.etree.ElementTree.fromstring(r.content)`
			`for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):`
			`doi = entry.find("{http://arxiv.org/schemas/atom}doi")`
			`if doi is not None:`
			`return doi.text`
			`return None`