libbmc/libbmc/repositories/arxiv.py

"""
This file contains all the arXiv-related functions.
"""
import io
import re
import tarfile
import xml.etree.ElementTree
from urllib.error import HTTPError

import arxiv2bib
import bibtexparser
import requests
from requests.exceptions import RequestException

from libbmc import tools
from libbmc.citations import bbl


# Identifier scheme in use from 2007 onwards: four digits, a dot, four or five
# digits and an optional version suffix (e.g. ``1401.2910`` or
# ``1401.2910v2``).
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
# Identifier scheme in use before 2007: a category name, a slash and a run of
# digits (e.g. ``hep-th/9901001`` or ``math.GT/0309136``).
# Every category name is passed through ``re.escape`` so that the dots they
# contain only match a literal dot and not any character.
ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + "|".join(map(re.escape, [
    "astro-ph.GA",
    "astro-ph.CO",
    "astro-ph.EP",
    "astro-ph.HE",
    "astro-ph.IM",
    "astro-ph.SR",
    "cond-mat.dis-nn",
    "cond-mat.mtrl-sci",
    "cond-mat.mes-hall",
    "cond-mat.other",
    "cond-mat.quant-gas",
    "cond-mat.soft",
    "cond-mat.stat-mech",
    "cond-mat.str-el",
    "cond-mat.supr-con",
    "gr-qc",
    "hep-ex",
    "hep-lat",
    "hep-ph",
    "hep-th",
    "math-ph",
    "nlin.AO",
    "nlin.CG",
    "nlin.CD",
    "nlin.SI",
    "nlin.PS",
    "nucl-ex",
    "nucl-th",
    "physics.acc-ph",
    "physics.ao-ph",
    "physics.atom-ph",
    "physics.atm-clus",
    "physics.bio-ph",
    "physics.chem-ph",
    "physics.class-ph",
    "physics.comp-ph",
    "physics.data-an",
    "physics.flu-dyn",
    "physics.gen-ph",
    "physics.geo-ph",
    "physics.hist-ph",
    "physics.ins-det",
    "physics.med-ph",
    "physics.optics",
    "physics.ed-ph",
    "physics.soc-ph",
    "physics.plasm-ph",
    "physics.pop-ph",
    "physics.space-ph",
    "physics.quant-ph",
    "math.AG",
    "math.AT",
    "math.AP",
    "math.CT",
    "math.CA",
    "math.CO",
    "math.AC",
    "math.CV",
    "math.DG",
    "math.DS",
    "math.FA",
    "math.GM",
    "math.GN",
    "math.GT",
    "math.GR",
    "math.HO",
    "math.IT",
    "math.KT",
    "math.LO",
    "math.MP",
    "math.MG",
    "math.NT",
    "math.NA",
    "math.OA",
    "math.OC",
    "math.PR",
    "math.QA",
    "math.RT",
    "math.RA",
    "math.SP",
    "math.ST",
    "math.SG",
    "cs.AI",
    "cs.CL",
    "cs.CC",
    "cs.CE",
    "cs.CG",
    "cs.GT",
    "cs.CV",
    "cs.CY",
    "cs.CR",
    "cs.DS",
    "cs.DB",
    "cs.DL",
    "cs.DM",
    "cs.DC",
    "cs.ET",
    "cs.FL",
    "cs.GL",
    "cs.GR",
    "cs.AR",
    "cs.HC",
    "cs.IR",
    "cs.IT",
    "cs.LG",
    "cs.LO",
    "cs.MS",
    "cs.MA",
    "cs.MM",
    "cs.NI",
    "cs.NE",
    "cs.NA",
    "cs.OS",
    "cs.OH",
    "cs.PF",
    "cs.PL",
    "cs.RO",
    "cs.SI",
    "cs.SE",
    "cs.SD",
    "cs.SC",
    "cs.SY",
    "q-bio.BM",
    "q-bio.CB",
    "q-bio.GN",
    "q-bio.MN",
    "q-bio.NC",
    "q-bio.OT",
    "q-bio.PE",
    "q-bio.QM",
    "q-bio.SC",
    "q-bio.TO",
    "q-fin.CP",
    "q-fin.EC",
    "q-fin.GN",
    "q-fin.MF",
    "q-fin.PM",
    "q-fin.PR",
    "q-fin.RM",
    "q-fin.ST",
    "q-fin.TR",
    "stat.AP",
    "stat.CO",
    "stat.ML",
    "stat.ME",
    "stat.OT",
    "stat.TH",
])) + r")/\d+"
# Case-insensitive pattern matching either identifier scheme.
# Capturing groups: 1 = whole new-style id, 2 = its version suffix,
# 3 = whole old-style id, 4 = its category. Group 0 is the full identifier
# whichever scheme matched.
REGEX = re.compile(
    "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" +
    ARXIV_IDENTIFIER_BEFORE_2007 + ")",
    re.IGNORECASE)

# Base arXiv URL of a preprint, sometimes used as an id. The ``{arxiv_id}``
# placeholder is filled in with ``str.format``.
ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"
# Eprint URL used to download the sources of a preprint. The ``{arxiv_id}``
# placeholder is filled in with ``str.format``.
ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"


def get_latest_version(arxiv_id):
    """
    Find the latest version of a given arXiv eprint.

    :param arxiv_id: The arXiv ID to query, with or without a version suffix.
    :returns: The ``eprint`` field of the BibTeX entry fetched for the
        unversioned preprint, as a string. ``None`` if no BibTeX entry could
        be fetched or if the entry has no ``eprint`` field.
    """
    # Trick: strip the version from the arXiv id, to query updated BibTeX for
    # the preprint and not the specific version
    arxiv_preprint_id = strip_version(arxiv_id)
    bibtex = get_bibtex(arxiv_preprint_id)
    if bibtex is None:
        # The lookup failed, there is nothing to parse.
        return None

    # ``entries`` is the list of parsed entries (one dict per entry). A
    # single id was queried, so only the first entry is relevant.
    entries = bibtexparser.loads(bibtex).entries
    if not entries:
        return None
    return entries[0].get("eprint")


def strip_version(arxiv_id):
    """
    Drop a trailing version marker (``v`` followed by digits) from an arXiv id.

    :param arxiv_id: The arXiv ID, with or without a version suffix.
    :returns: The same arXiv ID with the trailing version suffix removed. \
            IDs without such a suffix are returned unchanged.
    """
    # \Z anchors at the very end of the string, so only a suffix is removed.
    version_suffix = re.compile(r"v\d+\Z")
    return version_suffix.sub('', arxiv_id)


def is_valid(arxiv_id):
    """
    Tell whether a string is, in its entirety, a well-formed arXiv ID.

    :param arxiv_id: The arXiv ID to be checked.
    :returns: ``True`` if ``REGEX`` matches at the start of the string and \
            the match spans the whole string, ``False`` otherwise.
    """
    found = REGEX.match(arxiv_id)
    if found is None:
        return False
    # A match on a mere prefix of the string is not enough.
    return found.group(0) == arxiv_id


def get_bibtex(arxiv_id):
    """
    Fetch the BibTeX entry matching an arXiv ID.

    .. note::

        Using awesome https://pypi.python.org/pypi/arxiv2bib/ module.

    :param arxiv_id: The canonical arXiv id to get BibTeX from.
    :returns: The BibTeX string of the first result which is not an \
            ``arxiv2bib.ReferenceErrorInfo``, or ``None`` if there is none \
            or if the query raised an ``HTTPError``.
    """
    try:
        references = arxiv2bib.arxiv2bib([arxiv_id])
    except HTTPError:
        # Treat a failed query as an empty result set.
        references = []

    for reference in references:
        # Error placeholders carry no BibTeX, skip them.
        if not isinstance(reference, arxiv2bib.ReferenceErrorInfo):
            return reference.bibtex()
    return None


def extract_from_text(text):
    """
    Extract arXiv IDs from a text.

    :param text: The text to extract arXiv IDs from.
    :returns: A list of the arXiv IDs (as strings) found in the text, with
        duplicates removed by ``tools.remove_duplicates``.
    """
    # REGEX contains capturing groups, so ``findall`` would return one tuple
    # of groups per match. Use the whole match (group 0) to get the id itself.
    arxiv_ids = [match.group(0) for match in REGEX.finditer(text)]
    return tools.remove_duplicates(arxiv_ids)


def to_URL(arxiv_ids):
    """
    Build arXiv URLs from canonical arXiv IDs.

    :param arxiv_ids: Either a list of canonical arXiv IDs or a single one.
    :returns: A list of URLs if a list was given, a single URL otherwise.
    """
    if not isinstance(arxiv_ids, list):
        # Anything which is not a list is handled as one single id.
        return ARXIV_URL.format(arxiv_id=arxiv_ids)
    return [ARXIV_URL.format(arxiv_id=single_id) for single_id in arxiv_ids]


def to_canonical(urls):
    """
    Extract canonical arXiv IDs from arXiv URLs.

    :param urls: Either a list of arXiv URLs or a single one.
    :returns: The result of ``extract_from_text`` on the URL if a single \
            one was given, or a list holding one such result per URL if a \
            list was given.
    """
    if not isinstance(urls, list):
        # Anything which is not a list is handled as one single URL.
        return extract_from_text(urls)
    return [extract_from_text(single_url) for single_url in urls]


def from_DOI(doi):
    """
    Get the arXiv eprint id for a given DOI.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found or if the HTTP
        request failed.
    """
    try:
        response = requests.get("http://export.arxiv.org/api/query",
                                params={
                                    "search_query": "doi:%s" % (doi,),
                                    "max_results": 1
                                })
        response.raise_for_status()
    except RequestException:
        return None
    feed = xml.etree.ElementTree.fromstring(response.content)
    for entry in feed.iter("{http://www.w3.org/2005/Atom}entry"):
        id_element = entry.find("{http://www.w3.org/2005/Atom}id")
        if id_element is None or not id_element.text:
            # Entry without a usable id, nothing to extract from it.
            continue
        entry_url = id_element.text
        # The id is a full arXiv URL. Keep everything after "/abs/" so that
        # old-style ids, which contain a slash themselves
        # (e.g. ``hep-th/9901001v1``), are not truncated.
        _, separator, arxiv_id = entry_url.partition("/abs/")
        if separator:
            return arxiv_id
        # Unexpected URL layout: fall back to the last URL component.
        return entry_url.split("/")[-1]
    return None


def to_DOI(arxiv_id):
    """
    Look up the DOI which arXiv associates with a given eprint.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param arxiv_id: The arXiv eprint id.
    :returns: The text of the first DOI element found in the returned \
            entries, or ``None`` if there is none or if the HTTP request \
            failed.
    """
    query = {
        "id_list": arxiv_id,
        "max_results": 1
    }
    try:
        response = requests.get("http://export.arxiv.org/api/query",
                                params=query)
        response.raise_for_status()
    except RequestException:
        return None

    feed = xml.etree.ElementTree.fromstring(response.content)
    # Lazily look up the DOI element of each entry, in document order.
    doi_elements = (
        entry.find("{http://arxiv.org/schemas/atom}doi")
        for entry in feed.iter("{http://www.w3.org/2005/Atom}entry")
    )
    return next(
        (element.text for element in doi_elements if element is not None),
        None)


def get_sources(arxiv_id):
    """
    Fetch the source archive of a preprint from arXiv.

    .. note::

        Bulk download of sources from arXiv is not permitted by their API. \
                You should have a look at http://arxiv.org/help/bulk_data_s3.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
            in a canonical form.
    :returns: A ``TarFile`` object opened on the downloaded sources, or \
            ``None`` if the download failed or the payload could not be \
            opened as a tar archive.
    """
    try:
        response = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
        response.raise_for_status()
        # The archive is kept in memory, nothing is written to disk.
        in_memory_archive = io.BytesIO(response.content)
        sources = tarfile.open(fileobj=in_memory_archive)
    except (RequestException, AssertionError, tarfile.TarError):
        return None
    return sources


def get_bbl(arxiv_id):
    """
    Get the .bbl files (if any) of a given preprint.

    .. note::

        Bulk download of sources from arXiv is not permitted by their API. \
                You should have a look at http://arxiv.org/help/bulk_data_s3.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
            a canonical form.
    :returns: A list of the full text of the ``.bbl`` files (possibly \
            empty), or ``None`` if the sources could not be fetched.
    """
    sources = get_sources(arxiv_id)
    if sources is None:
        # No archive could be fetched for this preprint.
        return None

    bbl_files = []
    # Close the archive once every .bbl member has been read.
    with sources:
        for member in sources.getmembers():
            if not member.name.endswith(".bbl"):
                continue
            content = sources.extractfile(member)
            if content is None:
                # Not a readable member (e.g. a directory named "*.bbl").
                continue
            bbl_files.append(content.read().decode(tarfile.ENCODING))
    return bbl_files


def get_citations(arxiv_id):
    """
    Get the DOIs cited by a given preprint.

    .. note::

        Bulk download of sources from arXiv is not permitted by their API. \
                You should have a look at http://arxiv.org/help/bulk_data_s3.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
            a canonical form.
    :returns: A dict of cleaned plaintext citations and their associated \
            DOI. The dict is empty if no ``.bbl`` file is available.
    """
    dois = {}
    # Get the list of bbl files for this preprint. ``get_bbl`` is documented
    # to possibly return ``None``, which is handled as "no bbl file".
    bbl_files = get_bbl(arxiv_id) or []
    for bbl_file in bbl_files:
        # Fetch the cited DOIs for each of the bbl files
        dois.update(bbl.get_cited_DOIs(bbl_file))
    return dois
First commit 2015-12-27 19:35:55 +01:00			`"""`
			`This file contains all the arXiv-related functions.`
			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`import arxiv2bib`
Add a function to look for updated arXiv versions. 2016-01-14 00:04:33 +01:00			`import bibtexparser`
First commit 2015-12-27 19:35:55 +01:00			`import io`
			`import re`
			`import requests`
			`import tarfile`
			`import xml.etree.ElementTree`

Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`from urllib.error import HTTPError`
			`from requests.exception import RequestException`


First commit 2015-12-27 19:35:55 +01:00			`from libbmc import tools`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`from libbmc.citations import bbl`
First commit 2015-12-27 19:35:55 +01:00

Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"`
			`ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("\|".join([`
First commit 2015-12-27 19:35:55 +01:00			`"astro-ph.GA",`
			`"astro-ph.CO",`
			`"astro-ph.EP",`
			`"astro-ph.HE",`
			`"astro-ph.IM",`
			`"astro-ph.SR",`
			`"cond-math.dis-nn",`
			`"cond-math.mtrl-sci",`
			`"cond-math.mes-hall",`
			`"cond-math.other",`
			`"cond-math.quant-gas",`
			`"cond-math.soft",`
			`"cond-math.stat-mech",`
			`"cond-math.str-el",`
			`"cond-math.supr-con",`
			`"gr-qc",`
			`"hep-ex",`
			`"hep-lat",`
			`"hep-ph",`
			`"hep-th",`
			`"math-ph",`
			`"nlin.AO",`
			`"nlin.CG",`
			`"nlin.CD",`
			`"nlin.SI",`
			`"nlin.PS",`
			`"nucl-ex",`
			`"nucl-th",`
			`"physics.acc-ph",`
			`"physics.ao-ph",`
			`"physics.atom-ph",`
			`"physics.atm-clus",`
			`"physics.bio-ph",`
			`"physics.chem-ph",`
			`"physics.class-ph",`
			`"physics.comp-ph",`
			`"physics.data-an",`
			`"physics.flu-dyn",`
			`"physics.gen-ph",`
			`"physics.geo-ph",`
			`"physics.hist-ph",`
			`"physics.ins-det",`
			`"physics.med-ph",`
			`"physics.optics",`
			`"physics.ed-ph",`
			`"physics.soc-ph",`
			`"physics.plasm-ph",`
			`"physics.pop-ph",`
			`"physics.space-ph",`
			`"physics.quant-ph",`
			`"math.AG",`
			`"math.AT",`
			`"math.AP",`
			`"math.CT",`
			`"math.CA",`
			`"math.CO",`
			`"math.AC",`
			`"math.CV",`
			`"math.DG",`
			`"math.DS",`
			`"math.FA",`
			`"math.GM",`
			`"math.GN",`
			`"math.GT",`
			`"math.GR",`
			`"math.HO",`
			`"math.IT",`
			`"math.KT",`
			`"math.LO",`
			`"math.MP",`
			`"math.MG",`
			`"math.NT",`
			`"math.NA",`
			`"math.OA",`
			`"math.OC",`
			`"math.PR",`
			`"math.QA",`
			`"math.RT",`
			`"math.RA",`
			`"math.SP",`
			`"math.ST",`
			`"math.SG",`
			`"cs.AI",`
			`"cs.CL",`
			`"cs.CC",`
			`"cs.CE",`
			`"cs.CG",`
			`"cs.GT",`
			`"cs.CV",`
			`"cs.CY",`
			`"cs.CR",`
			`"cs.DS",`
			`"cs.DB",`
			`"cs.DL",`
			`"cs.DM",`
			`"cs.DC",`
			`"cs.ET",`
			`"cs.FL",`
			`"cs.GL",`
			`"cs.GR",`
			`"cs.AR",`
			`"cs.HC",`
			`"cs.IR",`
			`"cs.IT",`
			`"cs.LG",`
			`"cs.LO",`
			`"cs.MS",`
			`"cs.MA",`
			`"cs.MM",`
			`"cs.NI",`
			`"cs.NE",`
			`"cs.NA",`
			`"cs.OS",`
			`"cs.OH",`
			`"cs.PF",`
			`"cs.PL",`
			`"cs.RO",`
			`"cs.SI",`
			`"cs.SE",`
			`"cs.SD",`
			`"cs.SC",`
			`"cs.SY",`
			`"q-bio.BM",`
			`"q-bio.CB",`
			`"q-bio.GN",`
			`"q-bio.MN",`
			`"q-bio.NC",`
			`"q-bio.OT",`
			`"q-bio.PE",`
			`"q-bio.QM",`
			`"q-bio.SC",`
			`"q-bio.TO",`
			`"q-fin.CP",`
			`"q-fin.EC",`
			`"q-fin.GN",`
			`"q-fin.MF",`
			`"q-fin.PM",`
			`"q-fin.PR",`
			`"q-fin.RM",`
			`"q-fin.ST",`
			`"q-fin.TR",`
			`"stat.AP",`
			`"stat.CO",`
			`"stat.ML",`
			`"stat.ME",`
			`"stat.OT",`
			`"stat.TH"])) + r")/\d+"`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`REGEX = re.compile(`
			`"(" + ARXIV_IDENTIFIER_FROM_2007 + ")\|(" +`
			`ARXIV_IDENTIFIER_BEFORE_2007 + ")",`
First commit 2015-12-27 19:35:55 +01:00			`re.IGNORECASE)`

			`# Base arXiv URL used as id sometimes`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"`
First commit 2015-12-27 19:35:55 +01:00			`# Eprint URL used to download sources`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"`
First commit 2015-12-27 19:35:55 +01:00

Add a function to look for updated arXiv versions. 2016-01-14 00:04:33 +01:00			`def get_latest_version(arxiv_id):`
			`"""`
			`Find the latest version of a given arXiv eprint.`

			`:param arxiv_id: The arXiv ID to query.`
			:returns: The latest version on eprint as a string, or ``None``.
			`"""`
			`# Get updated bibtex`
			`# Trick: strip the version from the arXiv id, to query updated BibTeX for`
			`# the preprint and not the specific version`
			`arxiv_preprint_id = strip_version(arxiv_id)`
			`updated_bibtex = bibtexparser.loads(get_bibtex(arxiv_preprint_id))`
			`updated_bibtex = next(updated_bibtex.entries_dict)`

			`try:`
			`return updated_bibtex["eprint"]`
			`except KeyError:`
			`return None`


			`def strip_version(arxiv_id):`
			`"""`
			`Remove the version suffix from an arXiv id.`

			`:param arxiv_id: The arXiv ID to strip.`
			`:returns: The arXiv ID without the suffix version`
			`"""`
			`return re.sub(r"v\d+\Z", '', arxiv_id)`


First commit 2015-12-27 19:35:55 +01:00			`def is_valid(arxiv_id):`
			`"""`
			`Check that a given arXiv ID is a valid one.`

			`:param arxiv_id: The arXiv ID to be checked.`
			`:returns: Boolean indicating whether the arXiv ID is valid or not.`
			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`match = REGEX.match(arxiv_id)`
First commit 2015-12-27 19:35:55 +01:00			`return ((match is not None) and (match.group(0) == arxiv_id))`


			`def get_bibtex(arxiv_id):`
			`"""`
Add identifiers fetching from papers 2016-01-10 15:12:06 +01:00			`Get a BibTeX entry for a given arXiv ID.`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00
			`.. note::`

			`Using awesome https://pypi.python.org/pypi/arxiv2bib/ module.`

			`:param arxiv_id: The canonical arXiv id to get BibTeX from.`
			:returns: A BibTeX string or ``None``.
First commit 2015-12-27 19:35:55 +01:00			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`# Fetch bibtex using arxiv2bib module`
			`try:`
			`bibtex = arxiv2bib.arxiv2bib([arxiv_id])`
			`except HTTPError:`
			`bibtex = []`

			`for bib in bibtex:`
			`if isinstance(bib, arxiv2bib.ReferenceErrorInfo):`
			`continue`
			`else:`
			`# Return fetched bibtex`
			`return bib.bibtex()`
			`# An error occurred, return None`
			`return None`
First commit 2015-12-27 19:35:55 +01:00

			`def extract_from_text(text):`
			`"""`
			`Extract arXiv IDs from a text.`

			`:param text: The text to extract arXiv IDs from.`
			`:returns: A list of matching arXiv IDs.`
			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`return tools.remove_duplicates(REGEX.findall(text))`
First commit 2015-12-27 19:35:55 +01:00

			`def to_URL(arxiv_ids):`
			`"""`
			`Convert a list of canonical DOIs to a list of DOIs URLs.`

			`:param dois: List of canonical DOIs.`
			`:returns: A list of DOIs URLs.`
			`"""`
			`if isinstance(arxiv_ids, list):`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`return [ARXIV_URL.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]`
First commit 2015-12-27 19:35:55 +01:00			`else:`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`return ARXIV_URL.format(arxiv_id=arxiv_ids)`
First commit 2015-12-27 19:35:55 +01:00

			`def to_canonical(urls):`
			`"""`
			`Convert a list of DOIs URLs to a list of canonical DOIs.`

			`:param dois: A list of DOIs URLs.`
			`:returns: List of canonical DOIs.`
			`"""`
			`if isinstance(urls, list):`
			`return [extract_from_text(url) for url in urls]`
			`else:`
			`return extract_from_text(urls)`


Complete isbn API and fix a typo in arXiv API. 2015-12-27 23:55:57 +01:00			`def from_DOI(doi):`
First commit 2015-12-27 19:35:55 +01:00			`"""`
			`Get the arXiv eprint id for a given DOI.`

			`.. note::`

			`Uses arXiv API. Will not return anything if arXiv is not aware of the`
			`associated DOI.`

			`:param doi: The DOI of the resource to look for.`
			:returns: The arXiv eprint id, or ``None`` if not found.
			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`try:`
			`r = requests.get("http://export.arxiv.org/api/query",`
			`params={`
			`"search_query": "doi:%s" % (doi,),`
			`"max_results": 1`
			`})`
			`r.raise_for_status()`
			`except RequestException:`
			`return None`
First commit 2015-12-27 19:35:55 +01:00			`e = xml.etree.ElementTree.fromstring(r.content)`
			`for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):`
			`id = entry.find("{http://www.w3.org/2005/Atom}id").text`
			`# id is an arXiv full URL. We only want the id which is the last URL`
			`# component.`
			`return id.split("/")[-1]`
			`return None`


Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`def to_DOI(arxiv_id):`
First commit 2015-12-27 19:35:55 +01:00			`"""`
			`Get the associated DOI for a given arXiv eprint.`

			`.. note::`

			`Uses arXiv API. Will not return anything if arXiv is not aware of the`
			`associated DOI.`

			`:param eprint: The arXiv eprint id.`
			:returns: The DOI if any, or ``None``.
			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`try:`
			`r = requests.get("http://export.arxiv.org/api/query",`
			`params={`
			`"id_list": arxiv_id,`
			`"max_results": 1`
			`})`
			`r.raise_for_status()`
			`except RequestException:`
			`return None`
First commit 2015-12-27 19:35:55 +01:00			`e = xml.etree.ElementTree.fromstring(r.content)`
			`for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):`
			`doi = entry.find("{http://arxiv.org/schemas/atom}doi")`
			`if doi is not None:`
			`return doi.text`
			`return None`


			`def get_sources(arxiv_id):`
			`"""`
			`Download sources on arXiv for a given preprint.`

Add a disclaimer about bulk downloadin arXiv 2016-01-07 00:54:20 +01:00			`.. note::`

			`Bulk download of sources from arXiv is not permitted by their API. \`
			`You should have a look at http://arxiv.org/help/bulk_data_s3.`

First commit 2015-12-27 19:35:55 +01:00			:param eprint: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in a \
			`canonical form.`
			:returns: A ``TarFile`` object of the sources of the arXiv preprint or \
			``None``.
			`"""`
			`try:`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))`
			`r.raise_for_status()`
First commit 2015-12-27 19:35:55 +01:00			`file_object = io.BytesIO(r.content)`
			`return tarfile.open(fileobj=file_object)`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`except (RequestException, AssertionError, tarfile.TarError):`
First commit 2015-12-27 19:35:55 +01:00			`return None`


			`def get_bbl(arxiv_id):`
			`"""`
			`Get the .bbl files (if any) of a given preprint.`

Add a disclaimer about bulk downloadin arXiv 2016-01-07 00:54:20 +01:00			`.. note::`

			`Bulk download of sources from arXiv is not permitted by their API. \`
			`You should have a look at http://arxiv.org/help/bulk_data_s3.`

Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			:param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
			`a canonical form.`
First commit 2015-12-27 19:35:55 +01:00			:returns: A list of the full text of the ``.bbl`` files (if any) \
			or ``None``.
			`"""`
			`tf = get_sources(arxiv_id)`
			`bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]`
			`bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)`
			`for member in bbl_files]`
			`return bbl_files`


			`def get_citations(arxiv_id):`
			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`Get the DOIs cited by a given preprint.`

Add a disclaimer about bulk downloadin arXiv 2016-01-07 00:54:20 +01:00			`.. note::`

			`Bulk download of sources from arXiv is not permitted by their API. \`
			`You should have a look at http://arxiv.org/help/bulk_data_s3.`

Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			:param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
			`a canonical form.`
			`:returns: A dict of cleaned plaintext citations and their associated DOI.`
First commit 2015-12-27 19:35:55 +01:00			`"""`
Reimport bbl citations parsing and make some minor fixes 2015-12-27 23:46:43 +01:00			`dois = {}`
			`# Get the list of bbl files for this preprint`
			`bbl_files = get_bbl(arxiv_id)`
			`for bbl_file in bbl_files:`
			`# Fetch the cited DOIs for each of the bbl files`
			`dois.update(bbl.get_cited_DOIs(bbl_file))`
			`return dois`