From 609fa6ce4f93269fc007d6fa59ecca5d49b321a6 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Wed, 20 Jan 2016 22:35:43 +0100 Subject: [PATCH] doi.py fetcher.py unittests --- libbmc/doi.py | 76 +++++++++++++++++++++++++++++++++++++++++------ libbmc/fetcher.py | 2 ++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/libbmc/doi.py b/libbmc/doi.py index f976489..925231d 100644 --- a/libbmc/doi.py +++ b/libbmc/doi.py @@ -10,7 +10,7 @@ from libbmc import tools # Taken from # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802 -REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b", +REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b", re.IGNORECASE) # Base dx.doi.org URL for redirections DX_URL = "http://dx.doi.org/{doi}" @@ -22,6 +22,30 @@ def is_valid(doi): :param doi: The DOI to be checked. :returns: Boolean indicating whether the DOI is valid or not. + + >>> is_valid('10.1209/0295-5075/111/40005') + True + + >>> is_valid('10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7') + True + + >>> is_valid('10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S') + True + + >>> is_valid('10.1007/978-3-642-28108-2_19') + True + + >>> is_valid('10.1007.10/978-3-642-28108-2_19') + True + + >>> is_valid('10.1016/S0735-1097(98)00347-7') + True + + >>> is_valid('10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2') + True + + >>> is_valid('') + False """ match = REGEX.match(doi) return ((match is not None) and (match.group(0) == doi)) @@ -33,6 +57,9 @@ def extract_from_text(text): :param text: The text to extract DOIs from. :returns: A list of found DOIs. 
+ + >>> sorted(extract_from_text('10.1209/0295-5075/111/40005 10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7 10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S 10.1007/978-3-642-28108-2_19 10.1007.10/978-3-642-28108-2_19 10.1016/S0735-1097(98)00347-7 10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2 ')) + ['10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S', '10.1007.10/978-3-642-28108-2_19', '10.1007/978-3-642-28108-2_19', '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7', '10.1016/S0735-1097(98)00347-7', '10.1209/0295-5075/111/40005', '10.1579/0044-7447(2006)35\\\\[89:RDUICP\\\\]2.0.CO;2'] """ return tools.remove_duplicates(REGEX.findall(text)) @@ -41,8 +68,14 @@ def to_URL(dois): """ Convert a list of canonical DOIs to a list of DOIs URLs. - :param dois: List of canonical DOIs. - :returns: A list of DOIs URLs. + :param dois: List of canonical DOIs. Can also be a single canonical DOI. + :returns: A list of DOIs URLs (resp. a single value). + + >>> to_URL(['10.1209/0295-5075/111/40005']) + ['http://dx.doi.org/10.1209/0295-5075/111/40005'] + + >>> to_URL('10.1209/0295-5075/111/40005') + 'http://dx.doi.org/10.1209/0295-5075/111/40005' """ if isinstance(dois, list): return [DX_URL.format(doi=doi) for doi in dois] @@ -54,13 +87,29 @@ def to_canonical(urls): """ Convert a list of DOIs URLs to a list of canonical DOIs. - :param dois: A list of DOIs URLs. - :returns: List of canonical DOIs. + :param urls: A list of DOIs URLs. Can also be a single DOI URL. + :returns: List of canonical DOIs (resp. a single value). ``None`` if an \ + error occurred. 
+ + >>> to_canonical(['http://dx.doi.org/10.1209/0295-5075/111/40005']) + ['10.1209/0295-5075/111/40005'] + + >>> to_canonical('http://dx.doi.org/10.1209/0295-5075/111/40005') + '10.1209/0295-5075/111/40005' + + >>> to_canonical('aaaa') is None + True + + >>> to_canonical(['aaaa']) is None + True """ - if isinstance(urls, list): - return [extract_from_text(url) for url in urls] - else: - return extract_from_text(urls) + try: + if isinstance(urls, list): + return [extract_from_text(url)[0] for url in urls] + else: + return extract_from_text(urls)[0] + except IndexError: + return None def get_oa_version(doi): @@ -73,6 +122,9 @@ def get_oa_version(doi): :param doi: A canonical DOI. :returns: The URL of the OA version of the given DOI, or ``None``. + + >>> get_oa_version('10.1209/0295-5075/111/40005') + 'http://arxiv.org/abs/1506.06690' """ # If DOI is a link, truncate it try: @@ -91,6 +143,9 @@ def get_linked_version(doi): :param doi: A canonical DOI. :returns: The canonical URL behind the DOI, or ``None``. + + >>> get_linked_version('10.1209/0295-5075/111/40005') + 'http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01' """ try: r = requests.head(to_URL(doi)) @@ -109,6 +164,9 @@ def get_bibtex(doi): :param doi: The canonical DOI to get BibTeX from. :returns: A BibTeX string or ``None``. 
+ + >>> get_bibtex('10.1209/0295-5075/111/40005') + '@article{Verney_2015,\\n\\tdoi = {10.1209/0295-5075/111/40005},\\n\\turl = {http://dx.doi.org/10.1209/0295-5075/111/40005},\\n\\tyear = 2015,\\n\\tmonth = {aug},\\n\\tpublisher = {{IOP} Publishing},\\n\\tvolume = {111},\\n\\tnumber = {4},\\n\\tpages = {40005},\\n\\tauthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\n\\ttitle = {Hybridization of first and second sound in a weakly interacting Bose gas},\\n\\tjournal = {{EPL}}\\n}' """ try: r = requests.get(to_URL(doi), diff --git a/libbmc/fetcher.py b/libbmc/fetcher.py index cd3acca..e6ff516 100644 --- a/libbmc/fetcher.py +++ b/libbmc/fetcher.py @@ -25,6 +25,8 @@ def download(url, proxies=[None]): :returns: A tuple of the raw content of the downloaded data and its \ associated content-type. Returns ``(None, None)`` if it was \ unable to download the document. + + # TODO: Unittests """ # Loop over all available connections for proxy in proxies: