libbmc/libbmc/doi.py

121 lines
3.0 KiB
Python

"""
This file contains all the DOI-related functions.
"""
import re
import requests
from requests.exception import RequestException
from libbmc import tools
# Pattern recognising a DOI inside arbitrary text. Borrowed from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
# The single capturing group holds the DOI itself.
REGEX = re.compile(
    r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
    flags=re.IGNORECASE,
)

# URL template of the dx.doi.org resolver; ``{doi}`` is replaced by the
# canonical DOI.
DX_URL = "http://dx.doi.org/{doi}"
def is_valid(doi):
    """
    Tell whether a string is, in its entirety, a canonical DOI.

    :param doi: The candidate DOI string.
    :returns: ``True`` when ``REGEX`` matches at the start of ``doi`` and \
            the matched text is the whole string, ``False`` otherwise.
    """
    found = REGEX.match(doi)
    if found is None:
        return False
    # A match on a prefix only is not enough: the matched text must be
    # exactly the input.
    return found.group(0) == doi
def extract_from_text(text):
    """
    Find every canonical DOI occurring in a text.

    :param text: The text to search for DOIs.
    :returns: The DOIs matched by ``REGEX`` in ``text``, de-duplicated by \
            ``tools.remove_duplicates``.
    """
    candidates = REGEX.findall(text)
    return tools.remove_duplicates(candidates)
def to_URL(dois):
    """
    Build the dx.doi.org URL(s) for one or several canonical DOIs.

    :param dois: Either a ``list`` of canonical DOIs or a single DOI.
    :returns: A list of URLs when given a list, a single URL otherwise.
    """
    if not isinstance(dois, list):
        # Anything that is not a list is treated as one single DOI.
        return DX_URL.format(doi=dois)
    return [DX_URL.format(doi=item) for item in dois]
def to_canonical(urls):
    """
    Extract canonical DOIs from one or several DOI URLs.

    :param urls: Either a ``list`` of DOI URLs or a single DOI URL.
    :returns: For a single URL, whatever ``extract_from_text`` returns for \
            it; for a list, a list holding one such result per URL, in \
            the same order.
    """
    if not isinstance(urls, list):
        # Anything that is not a list is treated as one single URL.
        return extract_from_text(urls)
    return [extract_from_text(address) for address in urls]
def get_oa_version(doi):
    """
    Get an OA version for a given DOI.

    .. note::

        Uses beta.dissem.in API.

    :param doi: A canonical DOI. It is inserted as-is in the API URL; a \
            DOI URL is not reduced to its canonical form here.
    :returns: The URL of the OA version of the given DOI, or ``None`` if \
            the request fails, the response is not valid JSON, the API \
            status is not ``"ok"`` or the expected fields are missing.
    """
    try:
        response = requests.get("http://beta.dissem.in/api/%s" % (doi,))
        response.raise_for_status()
        result = response.json()
        # Explicit comparison rather than ``assert``: assertions are
        # removed when Python runs with -O, which would skip this check.
        if result["status"] != "ok":
            return None
        return result["paper"]["pdf_url"]
    except (ValueError, KeyError, TypeError, RequestException):
        # ValueError: body is not JSON. KeyError: a field is missing.
        # TypeError: the JSON does not have the expected nesting (for
        # instance ``"paper": null``).
        return None
def get_linked_version(doi):
    """
    Look up where the dx.doi.org URL of a DOI redirects to.

    :param doi: A canonical DOI.
    :returns: The value of the ``location`` header of the HEAD response, \
            or ``None`` if that header is absent or the request fails.
    """
    try:
        response = requests.head(to_URL(doi))
    except RequestException:
        return None
    return response.headers.get("location")
def get_bibtex(doi):
    """
    Get a BibTeX entry for a given DOI.

    .. note::

        Adapted from https://gist.github.com/jrsmith3/5513926.

    :param doi: The canonical DOI to get BibTeX from.
    :returns: A BibTeX string, or ``None`` if the request fails or the \
            response is not served as ``application/x-bibtex``.
    """
    try:
        response = requests.get(to_URL(doi),
                                headers={"accept": "application/x-bibtex"})
        response.raise_for_status()
    except RequestException:
        return None

    # Compare the bare media type only: the header may carry parameters
    # (e.g. "; charset=utf-8") and media types are case-insensitive.
    # An explicit test is used instead of ``assert``, which is removed
    # when Python runs with -O.
    content_type = response.headers.get("content-type") or ""
    media_type = content_type.split(";", 1)[0].strip().lower()
    if media_type != "application/x-bibtex":
        return None
    return response.text