2015-12-27 19:35:55 +01:00
"""
This file contains all the DOI - related functions .
"""
import re
import requests
2016-01-10 18:35:08 +01:00
from requests . exceptions import RequestException
2015-12-27 23:46:43 +01:00
2016-02-01 17:32:24 +01:00
from libbmc import __valid_identifiers__
2015-12-27 19:35:55 +01:00
from libbmc import tools
2016-02-01 17:32:24 +01:00
# Append DOI to the valid identifiers list
__valid_identifiers__ + = [ " doi " ]
2015-12-27 19:35:55 +01:00
# Taken from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
2016-01-20 22:35:43 +01:00
REGEX = re . compile ( r " \ b(10[.][0-9] { 4,}(?:[.][0-9]+)*/(?:(?![ \" & \ ' ]) \ S)+) \ b " ,
2015-12-27 19:35:55 +01:00
re . IGNORECASE )
# Base dx.doi.org URL for redirections
2015-12-27 23:46:43 +01:00
DX_URL = " http://dx.doi.org/ {doi} "
2015-12-27 19:35:55 +01:00
2016-02-11 15:17:32 +01:00
# Base DISSEMIN API
DISSEMIN_API = " http://beta.dissem.in/api/ "
2015-12-27 19:35:55 +01:00
def is_valid ( doi ) :
"""
Check that a given DOI is a valid canonical DOI .
: param doi : The DOI to be checked .
: returns : Boolean indicating whether the DOI is valid or not .
2016-01-20 22:35:43 +01:00
>> > is_valid ( ' 10.1209/0295-5075/111/40005 ' )
True
>> > is_valid ( ' 10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7 ' )
True
>> > is_valid ( ' 10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S ' )
True
>> > is_valid ( ' 10.1007/978-3-642-28108-2_19 ' )
True
>> > is_valid ( ' 10.1007.10/978-3-642-28108-2_19 ' )
True
>> > is_valid ( ' 10.1016/S0735-1097(98)00347-7 ' )
True
>> > is_valid ( ' 10.1579/0044-7447(2006)35 \ [89:RDUICP \ ]2.0.CO;2 ' )
True
>> > is_valid ( ' <geo coords= " 10.4515260,51.1656910 " ></geo> ' )
False
2015-12-27 19:35:55 +01:00
"""
2015-12-27 23:46:43 +01:00
match = REGEX . match ( doi )
2016-02-17 16:04:36 +01:00
return ( match is not None ) and ( match . group ( 0 ) == doi )
2015-12-27 19:35:55 +01:00
def extract_from_text ( text ) :
"""
Extract canonical DOIs from a text .
: param text : The text to extract DOIs from .
: returns : A list of found DOIs .
2016-01-20 22:35:43 +01:00
>> > sorted ( extract_from_text ( ' 10.1209/0295-5075/111/40005 10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7 10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S 10.1007/978-3-642-28108-2_19 10.1007.10/978-3-642-28108-2_19 10.1016/S0735-1097(98)00347-7 10.1579/0044-7447(2006)35 \ [89:RDUICP \ ]2.0.CO;2 <geo coords= " 10.4515260,51.1656910 " ></geo> ' ) )
[ ' 10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S ' , ' 10.1007.10/978-3-642-28108-2_19 ' , ' 10.1007/978-3-642-28108-2_19 ' , ' 10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7 ' , ' 10.1016/S0735-1097(98)00347-7 ' , ' 10.1209/0295-5075/111/40005 ' , ' 10.1579/0044-7447(2006)35 \\ \\ [89:RDUICP \\ \\ ]2.0.CO;2 ' ]
2015-12-27 19:35:55 +01:00
"""
2015-12-27 23:46:43 +01:00
return tools . remove_duplicates ( REGEX . findall ( text ) )
2015-12-27 19:35:55 +01:00
2016-02-17 16:04:36 +01:00
def to_url ( dois ) :
2015-12-27 19:35:55 +01:00
"""
Convert a list of canonical DOIs to a list of DOIs URLs .
2016-01-20 22:35:43 +01:00
: param dois : List of canonical DOIs . Can also be a single canonical DOI .
: returns : A list of DOIs URLs ( resp . a single value ) .
2016-02-17 16:04:36 +01:00
>> > to_url ( [ ' 10.1209/0295-5075/111/40005 ' ] )
2016-01-20 22:35:43 +01:00
[ ' http://dx.doi.org/10.1209/0295-5075/111/40005 ' ]
2016-02-17 16:04:36 +01:00
>> > to_url ( ' 10.1209/0295-5075/111/40005 ' )
2016-01-20 22:35:43 +01:00
' http://dx.doi.org/10.1209/0295-5075/111/40005 '
2015-12-27 19:35:55 +01:00
"""
if isinstance ( dois , list ) :
2015-12-27 23:46:43 +01:00
return [ DX_URL . format ( doi = doi ) for doi in dois ]
2015-12-27 19:35:55 +01:00
else :
2015-12-27 23:46:43 +01:00
return DX_URL . format ( doi = dois )
2015-12-27 19:35:55 +01:00
def to_canonical ( urls ) :
"""
Convert a list of DOIs URLs to a list of canonical DOIs .
2016-01-20 22:35:43 +01:00
: param dois : A list of DOIs URLs . Can also be a single DOI URL .
: returns : List of canonical DOIs ( resp . a single value ) . ` ` None ` ` if an \
error occurred .
>> > to_canonical ( [ ' http://dx.doi.org/10.1209/0295-5075/111/40005 ' ] )
[ ' 10.1209/0295-5075/111/40005 ' ]
>> > to_canonical ( ' http://dx.doi.org/10.1209/0295-5075/111/40005 ' )
' 10.1209/0295-5075/111/40005 '
>> > to_canonical ( ' aaaa ' ) is None
True
>> > to_canonical ( [ ' aaaa ' ] ) is None
True
2015-12-27 19:35:55 +01:00
"""
2016-02-17 16:04:36 +01:00
return tools . map_or_apply ( extract_from_text , urls )
2015-12-27 19:35:55 +01:00
def get_oa_version ( doi ) :
"""
Get an OA version for a given DOI .
. . note : :
Uses beta . dissem . in API .
: param doi : A canonical DOI .
: returns : The URL of the OA version of the given DOI , or ` ` None ` ` .
2016-01-20 22:35:43 +01:00
>> > get_oa_version ( ' 10.1209/0295-5075/111/40005 ' )
' http://arxiv.org/abs/1506.06690 '
2015-12-27 19:35:55 +01:00
"""
try :
2016-02-17 16:04:36 +01:00
request = requests . get ( " %s %s " % ( DISSEMIN_API , doi ) )
request . raise_for_status ( )
result = request . json ( )
assert result [ " status " ] == " ok "
2015-12-27 19:35:55 +01:00
return result [ " paper " ] [ " pdf_url " ]
2015-12-27 23:46:43 +01:00
except ( AssertionError , ValueError , KeyError , RequestException ) :
2015-12-27 19:35:55 +01:00
return None
2016-02-11 15:17:32 +01:00
def get_oa_policy ( doi ) :
"""
Get OA policy for a given DOI .
. . note : :
Uses beta . dissem . in API .
: param doi : A canonical DOI .
: returns : The OpenAccess policy for the associated publications , or \
` ` None ` ` if unknown .
>> > tmp = get_oa_policy ( ' 10.1209/0295-5075/111/40005 ' ) ; ( tmp [ " published " ] , tmp [ " preprint " ] , tmp [ " postprint " ] , tmp [ " romeo_id " ] )
( ' can ' , ' can ' , ' can ' , ' 1896 ' )
2016-02-11 15:32:05 +01:00
>> > get_oa_policy ( ' 10.1215/9780822387268 ' ) is None
True
2016-02-11 15:17:32 +01:00
"""
try :
2016-02-17 16:04:36 +01:00
request = requests . get ( " %s %s " % ( DISSEMIN_API , doi ) )
request . raise_for_status ( )
result = request . json ( )
assert result [ " status " ] == " ok "
2016-02-11 15:17:32 +01:00
return ( [ i
for i in result [ " paper " ] [ " publications " ]
if i [ " doi " ] == doi ] [ 0 ] ) [ " policy " ]
2016-02-11 15:32:05 +01:00
except ( AssertionError , ValueError ,
KeyError , RequestException , IndexError ) :
2016-02-11 15:17:32 +01:00
return None
2015-12-27 19:35:55 +01:00
def get_linked_version ( doi ) :
"""
Get the original link behind the DOI .
: param doi : A canonical DOI .
: returns : The canonical URL behind the DOI , or ` ` None ` ` .
2016-01-20 22:35:43 +01:00
>> > get_linked_version ( ' 10.1209/0295-5075/111/40005 ' )
' http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01 '
2015-12-27 19:35:55 +01:00
"""
2015-12-27 23:46:43 +01:00
try :
2016-02-17 16:04:36 +01:00
request = requests . head ( to_url ( doi ) )
return request . headers . get ( " location " )
2015-12-27 23:46:43 +01:00
except RequestException :
return None
2015-12-27 19:35:55 +01:00
def get_bibtex ( doi ) :
"""
Get a BibTeX entry for a given DOI .
. . note : :
Adapted from https : / / gist . github . com / jrsmith3 / 5513926.
: param doi : The canonical DOI to get BibTeX from .
: returns : A BibTeX string or ` ` None ` ` .
2016-01-20 22:35:43 +01:00
>> > get_bibtex ( ' 10.1209/0295-5075/111/40005 ' )
' @article { Verney_2015, \\ n \\ tdoi = { 10.1209/0295-5075/111/40005}, \\ n \\ turl = { http://dx.doi.org/10.1209/0295-5075/111/40005}, \\ n \\ tyear = 2015, \\ n \\ tmonth = {aug} , \\ n \\ tpublisher = {{ IOP} Publishing}, \\ n \\ tvolume = {111} , \\ n \\ tnumber = {4} , \\ n \\ tpages = {40005} , \\ n \\ tauthor = { Lucas Verney and Lev Pitaevskii and Sandro Stringari}, \\ n \\ ttitle = { Hybridization of first and second sound in a weakly interacting Bose gas}, \\ n \\ tjournal = {{ EPL}} \\ n} '
2015-12-27 19:35:55 +01:00
"""
2015-12-27 23:46:43 +01:00
try :
2016-02-17 16:04:36 +01:00
request = requests . get ( to_url ( doi ) ,
headers = { " accept " : " application/x-bibtex " } )
request . raise_for_status ( )
assert request . headers . get ( " content-type " ) == " application/x-bibtex "
return request . text
2015-12-27 23:46:43 +01:00
except ( RequestException , AssertionError ) :
2015-12-27 19:35:55 +01:00
return None