2014-04-28 22:23:05 +02:00
|
|
|
#!/usr/bin/env python2
|
2013-05-11 16:10:48 +02:00
|
|
|
# coding=utf8
|
2014-04-26 11:52:19 +02:00
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
import isbntools
|
|
|
|
import re
|
|
|
|
import requesocks as requests # Requesocks is requests with SOCKS support
|
|
|
|
import subprocess
|
2014-05-14 23:07:06 +02:00
|
|
|
import sys
|
2014-05-02 00:07:49 +02:00
|
|
|
import arxiv2bib as arxiv_metadata
|
2014-04-28 22:23:05 +02:00
|
|
|
import tools
|
2014-04-26 11:52:19 +02:00
|
|
|
import params
|
2014-05-11 19:29:42 +02:00
|
|
|
from bibtexparser.bparser import BibTexParser
|
|
|
|
from isbntools.dev.fmt import fmtbib
|
2013-01-22 02:11:12 +01:00
|
|
|
|
2014-04-26 18:43:25 +02:00
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
def download(url):
    """Download the file at url, trying every available proxy in order.

    Checks that the response is a valid pdf or djvu file (based on the
    Content-Type header). Returns a tuple (content, 'pdf' or 'djvu') with
    the raw bytes of the file, or False if it could not be downloaded
    through any proxy.
    """
    for proxy in params.proxies:
        r_proxy = {
            "http": proxy,
            "https": proxy,
        }
        try:
            r = requests.get(url, proxies=r_proxy)
            # Server may omit Content-Length; fall back to 0 instead of
            # raising KeyError.
            size = int(r.headers.get('Content-Length', '0').strip() or 0)
            dl = ""
            dl_size = 0
            for buf in r.iter_content(1024):
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size > 0:
                        # Draw a 50-character progress bar on stdout.
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]"%('='*done,' '*(50-done)))
                        # BUGFIX: the percentage was computed as done/52,
                        # which capped the display at ~96%.
                        sys.stdout.write(" "+str(int(float(done)/50*100))+"%")
                        sys.stdout.flush()
            contenttype = False
            if 'pdf' in r.headers['content-type']:
                contenttype = 'pdf'
            elif 'djvu' in r.headers['content-type']:
                contenttype = 'djvu'

            if r.status_code != 200 or contenttype is False:
                # Not found, or not a supported file type: try next proxy.
                continue

            return dl, contenttype
        except requests.exceptions.RequestException:
            tools.warning("Unable to get "+url+" using proxy "+proxy+". It " +
                          "may not be available.")
            continue
    # Every proxy failed.
    return False
|
2014-04-28 22:23:05 +02:00
|
|
|
|
|
|
|
|
2014-05-11 19:29:42 +02:00
|
|
|
# Matches an ISBN-10 or ISBN-13 written with space or dash separators and
# preceded by the word "isbn" (case-insensitive). Group 1 is the number
# itself, separators included.
isbn_re = re.compile(r"isbn (([0-9]{3}[ -])?[0-9][ -][0-9]{2}[ -][0-9]{6}[ -][0-9])",
                     re.IGNORECASE)
|
2014-04-29 21:55:35 +02:00
|
|
|
|
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
def findISBN(src):
    """Search for a valid ISBN in src.

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns the ISBN (without separators) or False if not found or an
    error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  bufsize=1)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the checks below cannot raise NameError when
    # the extractor exits before the first line is read.
    extractISBN = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        # 'Œ' sometimes appears in extracted text where a dash was
        # expected (ligature mangling).
        extractISBN = isbn_re.search(extractfull.lower().replace('Œ',
                                                                 '-'))
        if extractISBN:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanISBN = False
    # Clean ISBN is the ISBN number without separators
    if extractISBN:
        cleanISBN = extractISBN.group(1).replace('-', '').replace(' ', '')
    return cleanISBN
|
|
|
|
|
|
|
|
|
|
|
|
def isbn2Bib(isbn):
    """Tries to get bibtex entry from an ISBN number"""
    # The 'default' service merges results from worldcat.org and Google
    # Books.
    metadata = isbntools.meta(isbn, 'default')
    return fmtbib('bibtex', metadata)
|
2014-04-28 22:23:05 +02:00
|
|
|
|
|
|
|
|
2014-05-11 19:29:42 +02:00
|
|
|
# Generic DOI: the literal "doi" (optionally followed by '/', ':' and a
# space) then a 7-character registrant prefix, a slash and a suffix ending
# in a digit.
doi_re = re.compile('(?<=doi)/?:?\s?[0-9\.]{7}/\S*[0-9]', re.IGNORECASE)
# PNAS papers print their DOI without the slash before "pnas".
doi_pnas_re = re.compile('(?<=doi).?10.1073/pnas\.\d+', re.IGNORECASE)
# Journal of Cell Biology DOIs appear without a "doi" marker in the text.
doi_jsb_re = re.compile('10\.1083/jcb\.\d{9}', re.IGNORECASE)
# Cleanup helpers applied to a raw DOI match:
clean_doi_re = re.compile('^/')              # leading slash to strip
clean_doi_fabse_re = re.compile('^10.1096')  # FASEB J prefix (truncate)
clean_doi_jcb_re = re.compile('^10.1083')    # JCB prefix (truncate)
clean_doi_len_re = re.compile(r'\d\.\d')     # digit.digit runs, for length fix
|
|
|
|
|
|
|
|
|
2014-04-28 22:23:05 +02:00
|
|
|
def findDOI(src):
    """Search for a valid DOI in src.

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns the DOI or False if not found or an error occurred.
    From : http://en.dogeno.us/2010/02/release-a-python-script-for-organizing-scientific-papers-pyrenamepdf-py/
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the code below cannot raise NameError when the
    # extractor exits before the first line is read.
    extractDOI = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        # 'Œ' sometimes appears in extracted text where a dash was
        # expected (ligature mangling).
        extractDOI = doi_re.search(extractfull.lower().replace('Œ', '-'))
        if not extractDOI:
            # PNAS fix
            extractDOI = doi_pnas_re.search(extractfull.lower().replace('pnas',
                                                                        '/pnas'))
            if not extractDOI:
                # JSB fix
                extractDOI = doi_jsb_re.search(extractfull.lower())
        if extractDOI:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False

    cleanDOI = False
    if extractDOI:
        cleanDOI = extractDOI.group(0).replace(':', '').replace(' ', '')
        if clean_doi_re.search(cleanDOI):
            # Strip the leading slash left over from the "doi/" tag.
            cleanDOI = cleanDOI[1:]
        # FABSE J fix
        if clean_doi_fabse_re.search(cleanDOI):
            cleanDOI = cleanDOI[:20]
        # Second JCB fix
        if clean_doi_jcb_re.search(cleanDOI):
            cleanDOI = cleanDOI[:21]
        if len(cleanDOI) > 40:
            # Heuristic truncation: the match probably glued trailing text
            # onto the DOI. Normalize the suffix, then cut at the first
            # alphabetic character that follows a digit.
            cleanDOItemp = clean_doi_len_re.sub('000', cleanDOI)
            reps = {'.': 'A', '-': '0'}
            cleanDOItemp = tools.replaceAll(cleanDOItemp[8:], reps)
            digitStart = 0
            for i in range(len(cleanDOItemp)):
                if cleanDOItemp[i].isdigit():
                    digitStart = 1
                if cleanDOItemp[i].isalpha() and digitStart:
                    break
            cleanDOI = cleanDOI[0:(8+i)]
    return cleanDOI
|
|
|
|
|
|
|
|
|
|
|
|
def doi2Bib(doi):
    """Returns a bibTeX string of metadata for a given DOI.

    From : https://gist.github.com/jrsmith3/5513926
    """
    # dx.doi.org performs content negotiation: asking for x-bibtex gets
    # the bibtex entry directly.
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    try:
        r = requests.get(url, headers=headers)
        if r.headers['content-type'] != 'application/x-bibtex':
            return ''
        return r.text
    except requests.exceptions.ConnectionError:
        tools.warning('Unable to contact remote server to get the bibtex ' +
                      'entry for doi '+doi)
        return ''
|
2014-05-01 19:46:04 +02:00
|
|
|
|
|
|
|
|
2014-05-11 19:29:42 +02:00
|
|
|
# arXiv identifier following an "arXiv:" tag, e.g. "arXiv:1303.1234v1" or
# old-style "arXiv:math.GT/0309136". Group 1 is the id.
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)', re.IGNORECASE)
|
2014-05-01 19:46:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
def findArXivId(src):
    """Searches for a valid arXiv id in src.

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns the arXiv id or False if not found or an error occurred.
    From : https://github.com/minad/bibsync/blob/3fdf121016f6187a2fffc66a73cd33b45a20e55d/lib/bibsync/utils.rb
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the checks below cannot raise NameError when
    # the extractor exits before the first line is read.
    extractID = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        extractID = arXiv_re.search(extractfull)
        if extractID:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    elif extractID is not None:
        return extractID.group(1)
    else:
        return False
|
2014-05-01 19:46:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
def arXiv2Bib(arxiv):
    """Returns bibTeX string of metadata for a given arXiv id

    arxiv is an arxiv id
    Returns False if no entry could be fetched.
    """
    bibtex = arxiv_metadata.arxiv2bib([arxiv])
    for bib in bibtex:
        if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
            # Fetch failed for this id; fall through to return False.
            continue
        else:
            fetched_bibtex = BibTexParser(bib.bibtex())
            fetched_bibtex = fetched_bibtex.get_entry_dict()
            # Only one id was requested, so take the single entry.
            fetched_bibtex = fetched_bibtex[list(fetched_bibtex.keys())[0]]
            # Drop any 'file' field coming from the parser.
            # BUGFIX: was "try: del(...) except: pass" — a bare except that
            # could hide unrelated errors.
            fetched_bibtex.pop('file', None)
            return tools.parsed2Bibtex(fetched_bibtex)
    return False
|
2014-05-02 00:33:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
# HAL id ("hal-" + 8 digits) followed by its version, e.g.
# "hal-01234567, version 2". Group 1 is the id, group 2 the version.
HAL_re = re.compile(r'(hal-\d{8}), version (\d+)')
|
|
|
|
|
|
|
|
|
|
|
|
def findHALId(src):
    """Searches for a valid HAL id in src

    src must be a .pdf or .djvu file; its text is extracted with
    pdftotext / djvutxt and scanned line by line.
    Returns a tuple of the HAL id and the version
    or False if not found or an error occurred.
    """
    if src.endswith(".pdf"):
        totext = subprocess.Popen(["pdftotext", src, "-"],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    elif src.endswith(".djvu"):
        totext = subprocess.Popen(["djvutxt", src],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    else:
        # Unsupported file type
        return False

    # BUGFIX: initialize so the checks below cannot raise NameError when
    # the extractor exits before the first line is read.
    extractID = None
    while totext.poll() is None:
        extractfull = totext.stdout.readline()
        extractID = HAL_re.search(extractfull)
        if extractID:
            # Found one: no need to extract the rest of the file.
            totext.terminate()
            break

    err = totext.communicate()[1]
    if totext.returncode > 0:
        # Error happened
        tools.warning(err)
        return False
    elif extractID is not None:
        return extractID.group(1), extractID.group(2)
    else:
        # BUGFIX: previously fell into the else branch with extractID set
        # to None and crashed with AttributeError on .group(1); return
        # False as documented instead.
        return False
|