arxiv_metadata/reference_fetcher/doi.py

"""
This file contains all the DOI-related functions.
"""
import requests

from . import regex
from . import tools


def extract_doi_links(urls):
    """
    Try to find a DOI from a given list of URLs.

    :param urls: A list of URLs.
    :returns: First matching DOI URL, or ``None``.
    """
    doi_urls = [url for url in urls if "/doi/" in url]
    if len(doi_urls) > 0:
        return ("http://dx.doi.org" +
                doi_urls[0][doi_urls[0].find("/doi/") + 4:])
    else:
        return None


def extract_arxiv_links(urls):
    """
    Try to find an arXiv link from a given list of URLs.

    :param urls: A list of URLs.
    :returns: First matching arXiv URL, or ``None``.
    """
    arxiv_urls = [url for url in urls if "://arxiv.org" in url]
    if len(arxiv_urls) > 0:
        return arxiv_urls[0]
    else:
        return None


def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):
    """
    Search for a valid article ID (DOI or ArXiv) in the given text
    (regex-based).

    From \
    http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/ \
    and \
    https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb.

    :param text: Input text on which matching is to be done.
    :param only: List of matches to look for (DOI/arXiv).
    :returns: a tuple ``(type, first matching ID)`` or ``None`` if not found.
    """
    text = text.lower()
    # Try to extract DOI
    if "DOI" in only:
        extractID = regex.doi.search(text.replace('&#338;', '-'))
        if not extractID:
            # PNAS fix
            extractID = regex.doi_pnas.search(text.
                                              replace('pnas', '/pnas'))
            if not extractID:
                # JSB fix
                extractID = regex.doi_jsb.search(text)
        if extractID:
            # If DOI extracted, clean it and return it
            cleanDOI = False
            cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')
            if regex.clean_doi.search(cleanDOI):
                cleanDOI = cleanDOI[1:]
            # FABSE J fix
            if regex.clean_doi_fabse.search(cleanDOI):
                cleanDOI = cleanDOI[:20]
            # Second JCB fix
            if regex.clean_doi_jcb.search(cleanDOI):
                cleanDOI = cleanDOI[:21]
            if len(cleanDOI) > 40:
                cleanDOItemp = regex.clean_doi_len.sub('000', cleanDOI)
                reps = {'.': 'A', '-': '0'}
                cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
                digitStart = 0
                for i in range(len(cleanDOItemp)):
                    if cleanDOItemp[i].isdigit():
                        digitStart = 1
                        if cleanDOItemp[i].isalpha() and digitStart:
                            break
                cleanDOI = cleanDOI[0:(8+i)]
            return ("DOI", cleanDOI)
    # Else, try to extract arXiv
    if "arXiv" in only:
        extractID = regex.arXiv.search(text)
        if extractID:
            return ("arXiv", extractID.group(1))
    return None


def get_oa_version(doi):
    """
    Get an OA version for a given DOI.

    :param doi: A DOI or a dx.doi.org link.
    :returns: The URL of the OA version of the given DOI, or ``None``.
    """
    # If DOI is a link, truncate it
    if "dx.doi.org" in doi:
        doi = doi[doi.find("dx.doi.org") + 11:]
    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
    oa_url = None
    if r.status_code == requests.codes.ok:
        result = r.json()
        if("status" in result and
           "paper" in result and
           result["status"] == "ok" and
           "pdf_url" in result["paper"]):
            oa_url = result["paper"]["pdf_url"]
    return oa_url
Add a function to look for an OA version for a given DOI 2015-12-23 23:03:40 +01:00			`"""`
			`This file contains all the DOI-related functions.`
			`"""`
			`import requests`

Move reference fetching code into a sub-library 2015-12-23 23:46:37 +01:00			`from . import regex`
			`from . import tools`
Clean a bit the code 2015-12-23 22:49:14 +01:00

			`def extract_doi_links(urls):`
			`"""`
			`Try to find a DOI from a given list of URLs.`
Update doc to rst format for generation with Sphinx 2015-12-25 01:43:49 +01:00
			`:param urls: A list of URLs.`
			:returns: First matching DOI URL, or ``None``.
Clean a bit the code 2015-12-23 22:49:14 +01:00			`"""`
			`doi_urls = [url for url in urls if "/doi/" in url]`
			`if len(doi_urls) > 0:`
			`return ("http://dx.doi.org" +`
			`doi_urls[0][doi_urls[0].find("/doi/") + 4:])`
			`else:`
			`return None`


			`def extract_arxiv_links(urls):`
			`"""`
			`Try to find an arXiv link from a given list of URLs.`
Update doc to rst format for generation with Sphinx 2015-12-25 01:43:49 +01:00
			`:param urls: A list of URLs.`
			:returns: First matching arXiv URL, or ``None``.
Clean a bit the code 2015-12-23 22:49:14 +01:00			`"""`
			`arxiv_urls = [url for url in urls if "://arxiv.org" in url]`
			`if len(arxiv_urls) > 0:`
			`return arxiv_urls[0]`
			`else:`
			`return None`


			`def match_doi_or_arxiv(text, only=["DOI", "arXiv"]):`
			`"""`
			`Search for a valid article ID (DOI or ArXiv) in the given text`
			`(regex-based).`

Update doc to rst format for generation with Sphinx 2015-12-25 01:43:49 +01:00			`From \`
			`http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/ \`
			`and \`
			`https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb.`

			`:param text: Input text on which matching is to be done.`
			`:param only: List of matches to look for (DOI/arXiv).`
			:returns: a tuple ``(type, first matching ID)`` or ``None`` if not found.
Clean a bit the code 2015-12-23 22:49:14 +01:00			`"""`
			`text = text.lower()`
			`# Try to extract DOI`
			`if "DOI" in only:`
			`extractID = regex.doi.search(text.replace('Œ', '-'))`
			`if not extractID:`
			`# PNAS fix`
			`extractID = regex.doi_pnas.search(text.`
			`replace('pnas', '/pnas'))`
			`if not extractID:`
			`# JSB fix`
			`extractID = regex.doi_jsb.search(text)`
			`if extractID:`
			`# If DOI extracted, clean it and return it`
			`cleanDOI = False`
			`cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')`
			`if regex.clean_doi.search(cleanDOI):`
			`cleanDOI = cleanDOI[1:]`
			`# FABSE J fix`
			`if regex.clean_doi_fabse.search(cleanDOI):`
			`cleanDOI = cleanDOI[:20]`
			`# Second JCB fix`
			`if regex.clean_doi_jcb.search(cleanDOI):`
			`cleanDOI = cleanDOI[:21]`
			`if len(cleanDOI) > 40:`
			`cleanDOItemp = regex.clean_doi_len.sub('000', cleanDOI)`
			`reps = {'.': 'A', '-': '0'}`
			`cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)`
			`digitStart = 0`
			`for i in range(len(cleanDOItemp)):`
			`if cleanDOItemp[i].isdigit():`
			`digitStart = 1`
			`if cleanDOItemp[i].isalpha() and digitStart:`
			`break`
			`cleanDOI = cleanDOI[0:(8+i)]`
			`return ("DOI", cleanDOI)`
			`# Else, try to extract arXiv`
			`if "arXiv" in only:`
			`extractID = regex.arXiv.search(text)`
			`if extractID:`
			`return ("arXiv", extractID.group(1))`
			`return None`
Add a function to look for an OA version for a given DOI 2015-12-23 23:03:40 +01:00

			`def get_oa_version(doi):`
			`"""`
			`Get an OA version for a given DOI.`

Update doc to rst format for generation with Sphinx 2015-12-25 01:43:49 +01:00			`:param doi: A DOI or a dx.doi.org link.`
			:returns: The URL of the OA version of the given DOI, or ``None``.
Add a function to look for an OA version for a given DOI 2015-12-23 23:03:40 +01:00			`"""`
			`# If DOI is a link, truncate it`
			`if "dx.doi.org" in doi:`
			`doi = doi[doi.find("dx.doi.org") + 11:]`
			`r = requests.get("http://beta.dissem.in/api/%s" % (doi,))`
			`oa_url = None`
			`if r.status_code == requests.codes.ok:`
			`result = r.json()`
			`if("status" in result and`
			`"paper" in result and`
			`result["status"] == "ok" and`
			`"pdf_url" in result["paper"]):`
			`oa_url = result["paper"]["pdf_url"]`
			`return oa_url`