bmc/libbmc/fetcher.py

# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff (and you can also do whatever
# you want with this stuff without retaining it, but that's not cool...). If we
# meet some day, and you think this stuff is worth it, you can buy me a
# <del>beer</del> soda in return.
#                                                                   Phyks
# -----------------------------------------------------------------------------


import isbnlib
import re
import socket
import socks
import subprocess
import sys
try:
    # For Python 3.0 and later
    from urllib.request import urlopen, Request
    from urllib.error import URLError
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, Request, URLError
import arxiv2bib as arxiv_metadata
import libbmc.tools as tools
import bibtexparser
from libbmc.config import Config


# Project configuration; download() reads its "proxies" entry.
config = Config()
# Reference to the unpatched socket class: download() replaces socket.socket
# when it goes through a proxy and puts this one back for direct connections.
default_socket = socket.socket
# Encoding used to decode the output of the external text extractors.
# sys.stdout may lack an `encoding` attribute, or the attribute may be None;
# fall back to UTF-8 in both cases. An explicit check is used instead of
# `assert`, which is removed when Python runs with -O.
stdout_encoding = getattr(sys.stdout, 'encoding', None)
if stdout_encoding is None:
    stdout_encoding = 'UTF-8'


def _parse_proxy(proxy):
    """Split a proxy specification into ``(proxy_type, host, port)``.

    Accepted forms are ``host``, ``host:port`` and ``scheme://host[:port]``.
    A specification starting with ``socks4`` selects SOCKS4, any other one
    starting with ``socks`` selects SOCKS5, and everything else is treated as
    an HTTP proxy. ``port`` is an int, or None when it is absent or not
    numeric (the host is then the whole remaining string).
    """
    if proxy.startswith('socks'):
        # proxy[5:6] instead of proxy[5]: no IndexError on a bare "socks"
        proxy_type = socks.SOCKS4 if proxy[5:6] == '4' else socks.SOCKS5
    else:
        proxy_type = socks.HTTP
    _, separator, remainder = proxy.partition('://')
    if separator:
        proxy = remainder
    host, separator, port = proxy.rpartition(':')
    if separator and port.isdigit():
        # The port ends up in a socket address, which requires an integer
        return proxy_type, host, int(port)
    return proxy_type, proxy, None


def _read_response(r):
    """Read a pdf or djvu document from the open response ``r``.

    Returns ``(content, contenttype)`` where ``content`` is the raw body and
    ``contenttype`` is ``'pdf'`` or ``'djvu'``, or None when the status code
    is not 200 or the Content-Type header announces neither format. The body
    is only read once the headers have been accepted. A progress bar is
    written to stdout when the response carries a usable Content-Length.
    """
    headers = r.info()
    # .get() on the header object is case-insensitive, unlike dict(headers)
    contenttype_req = (headers.get('Content-Type') or '').lower()
    if 'pdf' in contenttype_req:
        contenttype = 'pdf'
    elif 'djvu' in contenttype_req:
        contenttype = 'djvu'
    else:
        return None
    if r.getcode() != 200:
        return None

    try:
        size = int((headers.get('Content-Length') or '0').strip())
    except ValueError:
        # Malformed header: download anyway, just without a progress bar
        size = 0

    chunks = []
    dl_size = 0
    while True:
        buf = r.read(1024)
        if not buf:
            break
        chunks.append(buf)
        dl_size += len(buf)
        if size > 0:
            # Clamp in case the server sends more than it announced
            done = min(50, int(50 * dl_size / size))
            percent = min(100, int(100 * dl_size / size))
            sys.stdout.write("\r[%s%s]" % ('='*done, ' '*(50-done)))
            sys.stdout.write(" "+str(percent)+"%")
            sys.stdout.flush()
    return b"".join(chunks), contenttype


def download(url):
    """Download url and check that it is a pdf or djvu file.

    Tries all the proxies listed in the "proxies" configuration entry
    sequentially; an empty string in that list means "no proxy".

    Returns a tuple ``(content, contenttype)`` with the raw content of the
    file and ``'pdf'`` or ``'djvu'``, or ``(False, None)`` if the url is
    invalid or the file could not be downloaded through any proxy.

    NOTE(review): socket.socket is patched globally and is left in the state
    set for the last proxy tried when this function returns, so later
    connections made by this process keep using that proxy. This matches the
    previous behaviour; confirm whether it is intended.
    """
    for proxy in config.get("proxies"):
        if proxy == '':
            socket.socket = default_socket
        else:
            proxy_type, proxy, port = _parse_proxy(proxy)
            socks.set_default_proxy(proxy_type, proxy, port)
            socket.socket = socks.socksocket
        try:
            try:
                r = urlopen(url)
            except ValueError:
                # Raised by urlopen for malformed urls; no proxy can fix that
                tools.warning("Invalid URL")
                return False, None
            try:
                result = _read_response(r)
            finally:
                r.close()
        except (URLError, socket.error):
            if proxy != "":
                proxy_txt = "using proxy "+proxy
            else:
                proxy_txt = "without using any proxy"
            tools.warning("Unable to get "+url+" "+proxy_txt+". It " +
                          "may not be available at the moment.")
            continue
        if result is not None:
            return result
    return False, None


# Matches an "isbn" label (case-insensitive), optionally followed by one
# whitespace character, a colon and another whitespace character, then the
# number itself captured in group 1: an optional 3-digit prefix followed by
# three digit groups and a final single digit, where consecutive groups may
# be separated by one space or hyphen.
isbn_re = re.compile(r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',
                     re.IGNORECASE)


def findISBN(src):
    """Search for a valid ISBN in src.

    src is the path of a ``.pdf`` or ``.djvu`` file, whose text is extracted
    with ``pdftotext`` or ``djvutxt`` respectively.

    Returns the ISBN without separators, or False if src has another
    extension, the extractor exited with an error, or no ISBN was found.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() reads stdout and stderr together until the extractor
    # exits: no deadlock on a full stderr pipe, and the output is always
    # available, even when the process finishes immediately.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    # Undecodable bytes are replaced rather than aborting the search
    text = out.decode(stdout_encoding, 'replace')
    extractfull = ' '.join(line.strip() for line in text.split('\n'))
    extractISBN = isbn_re.search(extractfull.lower().replace('&#338;', '-'))
    if not extractISBN:
        return False
    # Clean ISBN is the ISBN number without separators
    return extractISBN.group(1).replace('-', '').replace(' ', '')


def isbn2Bib(isbn):
    """Look up the metadata of an ISBN and format it as a BibTeX entry.

    Returns the BibTeX string, or an empty string when isbnlib raises one of
    its own exceptions or a TypeError during the lookup or the formatting.
    """
    try:
        to_bibtex = isbnlib.registry.bibformatters['bibtex']
        # 'default' service: per the original author, it merges results from
        # worldcat.org and Google Books
        metadata = isbnlib.meta(isbn, 'default')
        return to_bibtex(metadata)
    except (isbnlib.ISBNLibException, isbnlib.ISBNToolsException, TypeError):
        return ''


# Patterns used by findArticleID(). They are written as raw strings so that
# regex escapes such as \s or \d are not parsed as (invalid) string escapes,
# and the dots of the literal DOI prefixes are escaped so that they only
# match a real dot.
#
# Generic DOI following a "doi" label: optional "/", ":" and whitespace, a
# 7-character prefix made of digits and dots, "/", then a suffix ending with
# a digit.
doi_re = re.compile(r'(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
# PNAS DOIs (prefix 10.1073) following a "doi" label
doi_pnas_re = re.compile(r'(?<=doi).?10\.1073/pnas\.\d+', re.IGNORECASE)
# JCB DOIs (prefix 10.1083), with or without a "doi" label
doi_jsb_re = re.compile(r'10\.1083/jcb\.\d{9}', re.IGNORECASE)
# Leading slash left over in front of an extracted DOI
clean_doi_re = re.compile(r'^/')
# Publisher prefixes whose DOIs are truncated to a fixed length
clean_doi_fabse_re = re.compile(r'^10\.1096')
clean_doi_jcb_re = re.compile(r'^10\.1083')
# A dot between two digits (masked when trimming over-long DOIs)
clean_doi_len_re = re.compile(r'\d\.\d')
# arXiv identifier following an "arXiv:" label; the id itself is group 1
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)


def _clean_doi(raw_doi):
    """Turn the text matched by one of the DOI patterns into a bare DOI."""
    clean_doi = raw_doi.replace(':', '').replace(' ', '')
    if clean_doi_re.search(clean_doi):
        clean_doi = clean_doi[1:]
    # FABSE J fix
    if clean_doi_fabse_re.search(clean_doi):
        clean_doi = clean_doi[:20]
    # Second JCB fix
    if clean_doi_jcb_re.search(clean_doi):
        clean_doi = clean_doi[:21]
    if len(clean_doi) > 40:
        # Over-long match: text following the DOI was probably swallowed.
        # Mask dots between digits as digits and the remaining dots as
        # letters, then cut at the first letter that follows a digit.
        # Assumes tools.replaceAll does plain substring replacement, so
        # that indices in the masked string map back onto clean_doi.
        masked = clean_doi_len_re.sub('000', clean_doi)
        masked = tools.replaceAll(masked[8:], {'.': 'A', '-': '0'})
        seen_digit = False
        for i, char in enumerate(masked):
            if char.isdigit():
                seen_digit = True
            elif char.isalpha() and seen_digit:
                clean_doi = clean_doi[:8+i]
                break
    return clean_doi


def findArticleID(src, only=("DOI", "arXiv")):
    """Search for a valid article ID (DOI or arXiv) in src.

    src is the path of a ``.pdf`` or ``.djvu`` file, whose text is extracted
    with ``pdftotext`` or ``djvutxt`` respectively. ``only`` restricts the
    kinds of identifiers looked for ("DOI" and/or "arXiv").

    Returns a tuple ``(type, id)`` where type is "DOI" or "arXiv", or
    ``(False, False)`` if src has another extension, the extractor exited
    with an error, or no identifier was found. When both kinds are found,
    the arXiv id is used only if it appears before the DOI.

    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return (False, False)

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() reads stdout and stderr together until the extractor
    # exits: no deadlock on a full stderr pipe, and the output is always
    # processed, even when the process finishes immediately.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return (False, False)

    # Undecodable bytes are replaced rather than aborting the search
    text = out.decode(stdout_encoding, 'replace')
    extractfull = ' '.join(line.strip() for line in text.split('\n'))

    extract_type = False
    extractID = None
    # Try to extract DOI
    if "DOI" in only:
        lowered = extractfull.lower()
        extractID = (doi_re.search(lowered.replace('&#338;', '-')) or
                     # PNAS fix
                     doi_pnas_re.search(lowered.replace('pnas', '/pnas')) or
                     # JSB fix
                     doi_jsb_re.search(lowered))
        if extractID:
            extract_type = "DOI"
    # Try to extract arXiv
    if "arXiv" in only:
        tmp_extractID = arXiv_re.search(extractfull)
        if tmp_extractID:
            if not extractID or extractID.start(0) > tmp_extractID.start(1):
                # Only use arXiv id if it is before the DOI in the pdf
                extractID = tmp_extractID
                extract_type = "arXiv"

    if extract_type == "DOI":
        # If DOI extracted, clean it and return it
        return ("DOI", _clean_doi(extractID.group(0)))
    if extract_type == "arXiv":
        # If arXiv id is extracted, return it
        return ("arXiv", extractID.group(1))
    return (False, False)


def doi2Bib(doi):
    """Returns a bibTeX string of metadata for a given DOI.

    The DOI resolver is queried with an ``accept: application/x-bibtex``
    header. Returns the decoded response body, or an empty string if the
    server could not be reached or did not answer with BibTeX content.

    From : https://gist.github.com/jrsmith3/5513926
    """
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    req = Request(url, headers=headers)
    try:
        r = urlopen(req)
        try:
            # .get() on the header object is case-insensitive
            content_type = r.info().get('Content-Type') or ''
            # Compare the media type only, ignoring parameters such as
            # "; charset=utf-8"
            media_type = content_type.split(';')[0].strip().lower()
            if media_type != 'application/x-bibtex':
                return ''
            return r.read().decode('utf-8')
        finally:
            r.close()
    except (URLError, socket.error):
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''


def arXiv2Bib(arxiv):
    """Returns bibTeX string of metadata for a given arXiv id

    arxiv is an arxiv id. The first reference returned by arxiv2bib that is
    not an error is parsed, stripped of its 'file' field and formatted again;
    an empty string is returned when there is no such reference.
    """
    for reference in arxiv_metadata.arxiv2bib([arxiv]):
        if isinstance(reference, arxiv_metadata.ReferenceErrorInfo):
            # Lookup failed for this reference, try the next one
            continue
        entries = bibtexparser.loads(reference.bibtex()).entries_dict
        first_key = list(entries.keys())[0]
        entry = entries[first_key]
        if 'file' in entry:
            del entry['file']
        return tools.parsed2Bibtex(entry)
    return ''


# Matches a HAL identifier ("hal-" followed by eight digits) together with
# its version, as in "hal-00123456, version 2"; group 1 is the identifier
# and group 2 the version number.
HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')


def findHALId(src):
    """Searches for a valid HAL id in src

    src is the path of a ``.pdf`` or ``.djvu`` file, whose text is extracted
    with ``pdftotext`` or ``djvutxt`` respectively.

    Returns a tuple of the HAL id and the version (both as strings), or
    False if src has another extension, the extractor exited with an error,
    or no HAL id was found.
    """
    if src.endswith(".pdf"):
        command = ["pdftotext", src, "-"]
    elif src.endswith(".djvu"):
        command = ["djvutxt", src]
    else:
        return False

    totext = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # communicate() reads stdout and stderr together until the extractor
    # exits: no deadlock on a full stderr pipe, and the output is always
    # processed, even when the process finishes immediately.
    out, err = totext.communicate()
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    # Undecodable bytes are replaced rather than aborting the search
    text = out.decode(stdout_encoding, 'replace')
    extractfull = ' '.join(line.strip() for line in text.split('\n'))
    extractID = HAL_re.search(extractfull)
    if not extractID:
        # No HAL id in the document
        return False
    return extractID.group(1), extractID.group(2)
License + Consolidating fetcher.py 2014-05-26 16:12:21 +02:00			`# -*- coding: utf8 -*-`
			`# -----------------------------------------------------------------------------`
			`# "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):`
			`# Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice`
			`# you can do whatever you want with this stuff (and you can also do whatever`
			`# you want with this stuff without retaining it, but that's not cool...). If we`
			`# meet some day, and you think this stuff is worth it, you can buy me a`
			`# <del>beer</del> soda in return.`
			`# Phyks`
			`# -----------------------------------------------------------------------------`

Download of papers working. You should pass the URL of the PDF file to the script, along with the `download` parameter. It will try the proxies listed in the `params.py` file until it finds one that allows it to fetch the PDF file. TODO: Use pdfparanoia to remove watermarks 2014-04-26 11:52:19 +02:00
Updated for new version of isbnlib 2014-07-08 21:37:13 +02:00			`import isbnlib`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`import re`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`import socket`
			`import socks`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`import subprocess`
Progress bar for download 2014-05-14 23:07:06 +02:00			`import sys`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`try:`
			`# For Python 3.0 and later`
Flake8 fixes 2014-08-03 00:17:01 +02:00			`from urllib.request import urlopen, Request`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`from urllib.error import URLError`
			`except ImportError:`
			`# Fall back to Python 2's urllib2`
Flake8 fixes 2014-08-03 00:17:01 +02:00			`from urllib2 import urlopen, Request, URLError`
Functions to handle arXiv metadata 2014-05-02 00:07:49 +02:00			`import arxiv2bib as arxiv_metadata`
Flake8 fixes 2014-08-03 00:17:01 +02:00			`import libbmc.tools as tools`
Fix issue #21 2014-12-03 12:54:24 +01:00			`import bibtexparser`
Fix imports + subparsers in python3 2014-08-03 00:09:07 +02:00			`from libbmc.config import Config`
handle ieee xplore login.jsp urls 2013-01-22 02:11:12 +01:00
No fail if proxy is not accessible 2014-04-26 18:43:25 +02:00
Custom masks support 2014-06-30 23:02:30 +02:00			`config = Config()`
Fix imports + subparsers in python3 2014-08-03 00:09:07 +02:00			`default_socket = socket.socket`
Fix sys.stdout.encoding error 2014-08-03 21:37:34 +02:00			`try:`
			`stdout_encoding = sys.stdout.encoding`
			`assert(stdout_encoding is not None)`
			`except (AttributeError, AssertionError):`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`stdout_encoding = 'UTF-8'`
Custom masks support 2014-06-30 23:02:30 +02:00

Beginning of refactor 2014-04-28 22:23:05 +02:00			`def download(url):`
			`"""Download url tofile`
No fail if proxy is not accessible 2014-04-26 18:43:25 +02:00
Beginning of refactor 2014-04-28 22:23:05 +02:00			`Check that it is a valid pdf or djvu file. Tries all the`
			`available proxies sequentially. Returns the raw content of the file, or`
			`false if it could not be downloaded.`
			`"""`
Updated conf 2014-06-30 00:19:38 +02:00			`for proxy in config.get("proxies"):`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`if proxy.startswith('socks'):`
			`if proxy[5] == '4':`
			`proxy_type = socks.SOCKS4`
			`else:`
			`proxy_type = socks.SOCKS5`
			`proxy = proxy[proxy.find('://')+3:]`
			`try:`
			`proxy, port = proxy.split(':')`
			`except ValueError:`
			`port = None`
			`socks.set_default_proxy(proxy_type, proxy, port)`
Fix imports + subparsers in python3 2014-08-03 00:09:07 +02:00			`socket.socket = socks.socksocket`
			`elif proxy == '':`
			`socket.socket = default_socket`
			`else:`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`try:`
			`proxy, port = proxy.split(':')`
			`except ValueError:`
			`port = None`
			`socks.set_default_proxy(socks.HTTP, proxy, port)`
Fix imports + subparsers in python3 2014-08-03 00:09:07 +02:00			`socket.socket = socks.socksocket`
No fail if proxy is not accessible 2014-04-26 18:43:25 +02:00			`try:`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`r = urlopen(url)`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`try:`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`size = int(dict(r.info())['content-length'].strip())`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`except KeyError:`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`try:`
			`size = int(dict(r.info())['Content-Length'].strip())`
			`except KeyError:`
Fix for Python 3 2014-08-03 23:34:17 +02:00			`size = 0`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`dl = b""`
Progress bar for download 2014-05-14 23:07:06 +02:00			`dl_size = 0`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`while True:`
			`buf = r.read(1024)`
Progress bar for download 2014-05-14 23:07:06 +02:00			`if buf:`
			`dl += buf`
			`dl_size += len(buf)`
Fix for Python 3 2014-08-03 23:34:17 +02:00			`if size != 0:`
			`done = int(50 * dl_size / size)`
			`sys.stdout.write("\r[%s%s]" % ('='*done, ' '*(50-done)))`
			`sys.stdout.write(" "+str(int(float(done)/52*100))+"%")`
			`sys.stdout.flush()`
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`else:`
			`break`
Remove first page of IOP papers + various bugfixes 2014-04-26 23:26:25 +02:00			`contenttype = False`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`contenttype_req = None`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`try:`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`contenttype_req = dict(r.info())['content-type']`
			`except KeyError:`
			`try:`
			`contenttype_req = dict(r.info())['Content-Type']`
			`except KeyError:`
			`continue`
			`try:`
			`if 'pdf' in contenttype_req:`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`contenttype = 'pdf'`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`elif 'djvu' in contenttype_req:`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`contenttype = 'djvu'`
			`except KeyError:`
			`pass`
fix jstor pdf urls 2013-02-22 00:13:22 +01:00
Rewrite to use PySocks 2014-08-02 23:34:34 +02:00			`if r.getcode() != 200 or contenttype is False:`
No fail if proxy is not accessible 2014-04-26 18:43:25 +02:00			`continue`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00
Progress bar for download 2014-05-14 23:07:06 +02:00			`return dl, contenttype`
Some unit test 2014-05-26 16:50:58 +02:00			`except ValueError:`
			`tools.warning("Invalid URL")`
Fixed bool is not iterable if URL is not fetchable 2014-06-30 11:05:03 +02:00			`return False, None`
Catch socket.error exceptions as seen in issue #20 2014-11-04 21:51:44 +01:00			`except (URLError, socket.error):`
			`if proxy != "":`
			`proxy_txt = "using proxy "+proxy`
			`else:`
			`proxy_txt = "without using any proxy"`
			`tools.warning("Unable to get "+url+" "+proxy_txt+". It " +`
			`"may not be available at the moment.")`
No fail if proxy is not accessible 2014-04-26 18:43:25 +02:00			`continue`
Fixed bool is not iterable if URL is not fetchable 2014-06-30 11:05:03 +02:00			`return False, None`
Beginning of refactor 2014-04-28 22:23:05 +02:00

Updated test files According to https://github.com/Phyks/BMC/issues/7. Also updated fetcher file to fix two bugs : * Whitespaces in ISBN * If PDF to text (or djvu to text) is not long enough, the end of the file was not processed. 2014-06-29 23:02:44 +02:00			`isbn_re = re.compile(r'isbn[\s]?:?[\s]?((?:[0-9]{3}[ -]?)?[0-9]{1,5}[ -]?[0-9]{1,7}[ -]?[0-9]{1,6}[- ]?[0-9])',`
Autoconfirm option + flake8 2014-05-17 17:23:56 +02:00			`re.IGNORECASE)`
Various re.compile 2014-04-29 21:55:35 +02:00
Some unit test 2014-05-26 16:50:58 +02:00
Beginning of refactor 2014-04-28 22:23:05 +02:00			`def findISBN(src):`
			`"""Search for a valid ISBN in src.`

			`Returns the ISBN or false if not found or an error occurred."""`
			`if src.endswith(".pdf"):`
			`totext = subprocess.Popen(["pdftotext", src, "-"],`
			`stdout=subprocess.PIPE,`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`stderr=subprocess.PIPE,`
			`bufsize=1)`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`elif src.endswith(".djvu"):`
			`totext = subprocess.Popen(["djvutxt", src],`
			`stdout=subprocess.PIPE,`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`stderr=subprocess.PIPE,`
			`bufsize=1)`
Bugfixes in fetcher.py + function to find hal id 2014-05-02 00:33:09 +02:00			`else:`
			`return False`

Flake8 2014-04-30 00:54:15 +02:00			`while totext.poll() is None:`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`extractfull = ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])`
Flake8 2014-04-30 00:54:15 +02:00			`extractISBN = isbn_re.search(extractfull.lower().replace('&#338;',`
			`'-'))`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`if extractISBN:`
			`totext.terminate()`
			`break`

			`err = totext.communicate()[1]`
			`if totext.returncode > 0:`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`# Error happened`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`tools.warning(err)`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`return False`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00
Beginning of refactor 2014-04-28 22:23:05 +02:00			`cleanISBN = False`
			`# Clean ISBN is the ISBN number without separators`
			`if extractISBN:`
			`cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')`
			`return cleanISBN`


			`def isbn2Bib(isbn):`
Function for automatic find of arxiv id 2014-05-01 19:46:04 +02:00			`"""Tries to get bibtex entry from an ISBN number"""`
Import / Download / Delete working All bug should be fixed for the import / download / delete functions. * Some problems with utf-8 and homogeneize_latex_encoding in python-bbtexparser are bypassed and will be cleaned in a better way when the latest version will be available in pip. * Tweaked regex for isbn, which was not cas insensitive and forgot about spaces separated numbers. * File entry in arXiv bibtex is now deleted to avoid confusion. 2014-05-11 19:29:42 +02:00			`# Default merges results from worldcat.org and google books`
License + Consolidating fetcher.py 2014-05-26 16:12:21 +02:00			`try:`
Updated for new version of isbnlib 2014-07-08 21:37:13 +02:00			`return isbnlib.registry.bibformatters['bibtex'](isbnlib.meta(isbn,`
			`'default'))`
			`except (isbnlib.ISBNLibException, isbnlib.ISBNToolsException, TypeError):`
License + Consolidating fetcher.py 2014-05-26 16:12:21 +02:00			`return ''`
Beginning of refactor 2014-04-28 22:23:05 +02:00

Import / Download / Delete working All bug should be fixed for the import / download / delete functions. * Some problems with utf-8 and homogeneize_latex_encoding in python-bbtexparser are bypassed and will be cleaned in a better way when the latest version will be available in pip. * Tweaked regex for isbn, which was not cas insensitive and forgot about spaces separated numbers. * File entry in arXiv bibtex is now deleted to avoid confusion. 2014-05-11 19:29:42 +02:00			`doi_re = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)`
			`doi_pnas_re = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)`
			`doi_jsb_re = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)`
Various re.compile 2014-04-29 21:55:35 +02:00			`clean_doi_re = re.compile('^/')`
			`clean_doi_fabse_re = re.compile('^10.1096')`
			`clean_doi_jcb_re = re.compile('^10.1083')`
			`clean_doi_len_re = re.compile(r'\d\.\d')`
Solve issue #19 2014-11-04 21:11:19 +01:00			`arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)`
Various re.compile 2014-04-29 21:55:35 +02:00

Solve issue #19 2014-11-04 21:11:19 +01:00			`def findArticleID(src, only=["DOI", "arXiv"]):`
			`"""Search for a valid article ID (DOI or ArXiv) in src.`
Beginning of refactor 2014-04-28 22:23:05 +02:00
Solve issue #19 2014-11-04 21:11:19 +01:00			`Returns a tuple (type, first matching ID) or False if not found`
			`or an error occurred.`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/`
Solve issue #19 2014-11-04 21:11:19 +01:00			`and https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`"""`
			`if src.endswith(".pdf"):`
			`totext = subprocess.Popen(["pdftotext", src, "-"],`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE)`
			`elif src.endswith(".djvu"):`
			`totext = subprocess.Popen(["djvutxt", src],`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE)`
Bugfixes in fetcher.py + function to find hal id 2014-05-02 00:33:09 +02:00			`else:`
Solve issue #19 2014-11-04 21:11:19 +01:00			`return (False, False)`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00
License + Consolidating fetcher.py 2014-05-26 16:12:21 +02:00			`extractfull = ''`
Solve issue #19 2014-11-04 21:11:19 +01:00			`extract_type = False`
			`extractID = None`
Flake8 2014-04-30 00:54:15 +02:00			`while totext.poll() is None:`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`extractfull += ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])`
Solve issue #19 2014-11-04 21:11:19 +01:00			`# Try to extract DOI`
			`if "DOI" in only:`
			`extractID = doi_re.search(extractfull.lower().replace('&#338;', '-'))`
			`if not extractID:`
			`# PNAS fix`
			`extractID = doi_pnas_re.search(extractfull.`
			`lower().`
			`replace('pnas', '/pnas'))`
			`if not extractID:`
			`# JSB fix`
			`extractID = doi_jsb_re.search(extractfull.lower())`
			`if extractID:`
			`extract_type = "DOI"`
			`totext.terminate()`
			`# Try to extract arXiv`
			`if "arXiv" in only:`
			`tmp_extractID = arXiv_re.search(extractfull)`
			`if tmp_extractID:`
			`if not extractID or extractID.start(0) > tmp_extractID.start(1):`
			`# Only use arXiv id if it is before the DOI in the pdf`
			`extractID = tmp_extractID`
			`extract_type = "arXiv"`
			`totext.terminate()`
			`if extract_type is not False:`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`break`

			`err = totext.communicate()[1]`
			`if totext.returncode > 0:`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`# Error happened`
Check output before processing the whole file for ISBN and DOI search 2014-04-30 00:36:15 +02:00			`tools.warning(err)`
Solve issue #19 2014-11-04 21:11:19 +01:00			`return (False, False)`
Beginning of refactor 2014-04-28 22:23:05 +02:00
Solve issue #19 2014-11-04 21:11:19 +01:00			`if extractID is not None and extract_type == "DOI":`
			`# If DOI extracted, clean it and return it`
			`cleanDOI = False`
			`cleanDOI = extractID.group(0).replace(':', '').replace(' ', '')`
Various re.compile 2014-04-29 21:55:35 +02:00			`if clean_doi_re.search(cleanDOI):`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`cleanDOI = cleanDOI[1:]`
			`# FABSE J fix`
Various re.compile 2014-04-29 21:55:35 +02:00			`if clean_doi_fabse_re.search(cleanDOI):`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`cleanDOI = cleanDOI[:20]`
			`# Second JCB fix`
Various re.compile 2014-04-29 21:55:35 +02:00			`if clean_doi_jcb_re.search(cleanDOI):`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`cleanDOI = cleanDOI[:21]`
			`if len(cleanDOI) > 40:`
Various re.compile 2014-04-29 21:55:35 +02:00			`cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`reps = {'.': 'A', '-': '0'}`
			`cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)`
			`digitStart = 0`
			`for i in range(len(cleanDOItemp)):`
			`if cleanDOItemp[i].isdigit():`
			`digitStart = 1`
			`if cleanDOItemp[i].isalpha() and digitStart:`
			`break`
			`cleanDOI = cleanDOI[0:(8+i)]`
Solve issue #19 2014-11-04 21:11:19 +01:00			`return ("DOI", cleanDOI)`
			`elif extractID is not None and extract_type == "arXiv":`
			`# If arXiv id is extracted, return it`
			`return ("arXiv", extractID.group(1))`
			`return (False, False)`
Beginning of refactor 2014-04-28 22:23:05 +02:00

			`def doi2Bib(doi):`
Function for automatic find of arxiv id 2014-05-01 19:46:04 +02:00			`"""Returns a bibTeX string of metadata for a given DOI.`
Beginning of refactor 2014-04-28 22:23:05 +02:00
			`From : https://gist.github.com/jrsmith3/5513926`
			`"""`
			`url = "http://dx.doi.org/" + doi`
			`headers = {"accept": "application/x-bibtex"}`
Flake8 fixes 2014-08-03 00:17:01 +02:00			`req = Request(url, headers=headers)`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`try:`
Flake8 fixes 2014-08-03 00:17:01 +02:00			`r = urlopen(req)`
Beginning of refactor 2014-04-28 22:23:05 +02:00
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`try:`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`if dict(r.info())['content-type'] == 'application/x-bibtex':`
Fix 2014-08-03 21:52:01 +02:00			`return r.read().decode('utf-8')`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`else:`
			`return ''`
			`except KeyError:`
Small differences in between py2 and py3 2014-08-03 22:59:00 +02:00			`try:`
			`if dict(r.info())['Content-Type'] == 'application/x-bibtex':`
			`return r.read().decode('utf-8')`
			`else:`
			`return ''`
			`except KeyError:`
			`return ''`
Flake8 fixes 2014-08-03 00:17:01 +02:00			`except URLError:`
Beginning of refactor 2014-04-28 22:23:05 +02:00			`tools.warning('Unable to contact remote server to get the bibtex ' +`
			`'entry for doi '+doi)`
			`return ''`
Function for automatic find of arxiv id 2014-05-01 19:46:04 +02:00

			`def arXiv2Bib(arxiv):`
			`"""Returns bibTeX string of metadata for a given arXiv id`

			`arxiv is an arxiv id`
			`"""`
Functions to handle arXiv metadata 2014-05-02 00:07:49 +02:00			`bibtex = arxiv_metadata.arxiv2bib([arxiv])`
			`for bib in bibtex:`
			`if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):`
			`continue`
			`else:`
Fix issue #21 2014-12-03 12:54:24 +01:00			`fetched_bibtex = bibtexparser.loads(bib.bibtex())`
			`fetched_bibtex = fetched_bibtex.entries_dict`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]`
Import / Download / Delete working All bug should be fixed for the import / download / delete functions. * Some problems with utf-8 and homogeneize_latex_encoding in python-bbtexparser are bypassed and will be cleaned in a better way when the latest version will be available in pip. * Tweaked regex for isbn, which was not cas insensitive and forgot about spaces separated numbers. * File entry in arXiv bibtex is now deleted to avoid confusion. 2014-05-11 19:29:42 +02:00			`try:`
			`del(fetched_bibtex['file'])`
Cleaned too wide excepts 2014-07-01 20:31:19 +02:00			`except KeyError:`
Import / Download / Delete working All bug should be fixed for the import / download / delete functions. * Some problems with utf-8 and homogeneize_latex_encoding in python-bbtexparser are bypassed and will be cleaned in a better way when the latest version will be available in pip. * Tweaked regex for isbn, which was not cas insensitive and forgot about spaces separated numbers. * File entry in arXiv bibtex is now deleted to avoid confusion. 2014-05-11 19:29:42 +02:00			`pass`
			`return tools.parsed2Bibtex(fetched_bibtex)`
License + Consolidating fetcher.py 2014-05-26 16:12:21 +02:00			`return ''`
Bugfixes in fetcher.py + function to find hal id 2014-05-02 00:33:09 +02:00

			`HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')`


			`def findHALId(src):`
			`"""Searches for a valid HAL id in src`

			`Returns a tuple of the HAL id and the version`
			`or False if not found or an error occurred.`
			`"""`
			`if src.endswith(".pdf"):`
			`totext = subprocess.Popen(["pdftotext", src, "-"],`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE)`
			`elif src.endswith(".djvu"):`
			`totext = subprocess.Popen(["djvutxt", src],`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE)`
			`else:`
			`return False`

			`while totext.poll() is None:`
Further bugfixes for python3 2014-08-03 21:20:48 +02:00			`extractfull = ' '.join([i.decode(stdout_encoding).strip() for i in totext.stdout.readlines()])`
Bugfixes in fetcher.py + function to find hal id 2014-05-02 00:33:09 +02:00			`extractID = HAL_re.search(extractfull)`
			`if extractID:`
			`totext.terminate()`
			`break`

			`err = totext.communicate()[1]`
			`if totext.returncode > 0:`
			`# Error happened`
			`tools.warning(err)`
			`return False`
			`else:`
			`return extractID.group(1), extractID.group(2)`