Passing pylint on the module

Lucas Verney 2016-02-17 16:04:36 +01:00
parent a69e7ef6c1
commit a2ee654eac
16 changed files with 282 additions and 262 deletions

View File

@@ -1,15 +1,11 @@
+"""
+libbmc
+
+The :mod:`libbmc` is a generic Python library to manage bibliography and play
+with scientific papers.
+"""
+
 # Global list of valid paper identifier types. See README.md.
 __valid_identifiers__ = []
-
-# Import order of the modules is important, as they will populate
-# `__valid_identifiers__` on load, and the order in this list reflects their
-# priority.
-from libbmc import bibtex, doi, fetcher, isbn  # noqa
-from libbmc import citations, papers, repositories  # noqa

 __version__ = "0.1.3.1"
-
-__all__ = [
-    "bibtex", "doi", "fetcher", "isbn",
-    "citations", "papers", "repositories",
-]

View File

@@ -1,5 +0,0 @@
-from libbmc.citations import bbl, bibtex, pdf, plaintext
-
-__all__ = [
-    "bbl", "bibtex", "pdf", "plaintext"
-]

View File

@@ -73,7 +73,7 @@ def get_plaintext_citations(bbl):
     return cleaned_bbl


-def get_cited_DOIs(bbl):
+def get_cited_dois(bbl):
     """
     Get the DOIs of the papers cited in a .bbl file.

@@ -85,4 +85,4 @@ def get_cited_DOIs(bbl):
     # Get the plaintext citations from the bbl file
     plaintext_citations = get_plaintext_citations(bbl)
     # Use the plaintext citations parser on these citations
-    return plaintext.get_cited_DOIs(plaintext_citations)
+    return plaintext.get_cited_dois(plaintext_citations)
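
A minimal usage sketch of the renamed helper; the ``bbl_content`` variable and file path are hypothetical, and it is assumed here that the function takes the text of a ``.bbl`` file:

    from libbmc.citations import bbl

    with open("references.bbl") as fh:  # hypothetical .bbl file
        bbl_content = fh.read()
    # get_cited_dois() replaces the former get_cited_DOIs()
    cited_dois = bbl.get_cited_dois(bbl_content)
    print(cited_dois)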

View File

@@ -2,15 +2,20 @@
 This files contains all the functions to extract DOIs of citations from
 BibTeX files.
 """
-import bibtexparser
 import os
+
+import bibtexparser
 from bibtexparser.bparser import BibTexParser
 from bibtexparser.customization import convert_to_unicode
+
 from libbmc import tools
 from libbmc.citations import plaintext

+# TODO: Use beta.dissem.in with formatted citation
+

 def bibentry_as_plaintext(bibentry):
     """

@@ -51,7 +56,7 @@ def get_plaintext_citations(bibtex):
     return bibentries


-def get_cited_DOIs(bibtex):
+def get_cited_dois(bibtex):
     """
     Get the DOIs of the papers cited in a BibTeX file.

@@ -71,4 +76,4 @@ def get_cited_DOIs(bibtex):
     # Get the plaintext citations from the bibtex file
     plaintext_citations = get_plaintext_citations(bibtex)
     # Use the plaintext citations parser on these citations
-    return plaintext.get_cited_DOIs(plaintext_citations)
+    return plaintext.get_cited_dois(plaintext_citations)

View File

@@ -3,10 +3,11 @@ This files contains all the functions to extract DOIs of citations from
 PDF files.
 """
 import os
-import requests
 import subprocess
 import xml.etree.ElementTree as ET
+
+import requests
 from requests.exceptions import RequestException

 from libbmc import tools

@@ -17,7 +18,7 @@ CERMINE_BASE_URL = "http://cermine.ceon.pl/"
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


-def cermine(pdf_file, force_API=False, override_local=None):
+def cermine(pdf_file, force_api=False, override_local=None):
     """
     Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract metadata from \
             the given PDF file, to retrieve citations (and more) from the \

@@ -44,7 +45,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
             the CERMINE API terms.

     :param pdf_file: Path to the PDF file to handle.
-    :param force_API: Force the use of the Cermine API \
+    :param force_api: Force the use of the Cermine API \
             (and do not try to use a local JAR file). Defaults to ``False``.
     :param override_local: Use this specific JAR file, instead of the one at \
             the default location (``libbmc/external/cermine.jar``).

@@ -55,23 +56,23 @@
     # Check if we want to load the local JAR from a specific path
     local = override_local
     # Else, try to stat the JAR file at the expected local path
-    if (local is None) and (not force_API):
+    if (local is None) and (not force_api):
         if os.path.isfile(os.path.join(SCRIPT_DIR,
                                        "../external/cermine.jar")):
             local = os.path.join(SCRIPT_DIR,
                                  "../external/cermine.jar")

     # If we want to force the API use, or we could not get a local JAR
-    if force_API or (local is None):
+    if force_api or (local is None):
         print("Using API")
         with open(pdf_file, "rb") as fh:
             # Query the API
-            r = requests.post(
+            request = requests.post(
                 CERMINE_BASE_URL + "extract.do",
                 headers={"Content-Type": "application/binary"},
                 files={"file": fh}
             )
-            return r.text
+            return request.text
     # Else, use the local JAR file
     else:
         return subprocess.check_output([

@@ -86,7 +87,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
     return None


-def cermine_dois(pdf_file, force_API=False, override_local=None):
+def cermine_dois(pdf_file, force_api=False, override_local=None):
     """
     Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract DOIs of cited \
             papers from a PDF file.

@@ -116,7 +117,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
             try to match them on Crossref to get DOIs.

     :param pdf_file: Path to the PDF file to handle.
-    :param force_API: Force the use of the Cermine API \
+    :param force_api: Force the use of the Cermine API \
             (and do not try to use a local JAR file). Defaults to ``False``.
     :param override_local: Use this specific JAR file, instead of the one at \
             the default location (``libbmc/external/cermine.jar``).

@@ -126,7 +127,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
     #     * Do not convert to plain text, but use the extra metadata from
     #       CERMINE
     # Call CERMINE on the PDF file
-    cermine_output = cermine(pdf_file, force_API, override_local)
+    cermine_output = cermine(pdf_file, force_api, override_local)
     # Parse the resulting XML
     root = ET.fromstring(cermine_output)
     plaintext_references = [

@@ -136,7 +137,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
         ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
         for e in root.iter("mixed-citation")]
     # Call the plaintext methods to fetch DOIs
-    return plaintext.get_cited_DOIs(plaintext_references)
+    return plaintext.get_cited_dois(plaintext_references)


 def grobid(pdf_folder, grobid_home=None, grobid_jar=None):

@@ -156,6 +157,8 @@ def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
     :param grobid_jar: Path to the built Grobid JAR file.
     :returns: ``True``, or ``False`` if an error occurred.
     """
+    # TODO: Should be using https://github.com/kermitt2/grobid-example and
+    # BibTeX backend.
     if grobid_home is None or grobid_jar is None:
         # User should pass the correct paths
         return False

@@ -234,4 +237,4 @@ def pdfextract_dois(pdf_file):
     root = ET.fromstring(references)
     plaintext_references = [e.text for e in root.iter("reference")]
     # Call the plaintext methods to fetch DOIs
-    return plaintext.get_cited_DOIs(plaintext_references)
+    return plaintext.get_cited_dois(plaintext_references)
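
A minimal sketch of the renamed keyword argument; the PDF path is hypothetical, and ``force_api=True`` makes the call hit the remote CERMINE API instead of a local ``cermine.jar``:

    from libbmc.citations import pdf

    # Force the remote CERMINE API instead of a local cermine.jar
    dois = pdf.cermine_dois("/tmp/paper.pdf", force_api=True)
    print(dois)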

View File

@@ -37,7 +37,7 @@ def get_plaintext_citations(file):
     return cleaned_citations


-def get_cited_DOIs(file):
+def get_cited_dois(file):
     """
     Get the DOIs of the papers cited in a plaintext file. The file should \
             have one citation per line.

@@ -66,29 +66,29 @@ def get_cited_DOIs(file):
     # Try to get the DOI directly from the citation
     for citation in plaintext_citations[:]:
         # Some citations already contain a DOI so try to match it directly
-        matched_DOIs = doi.extract_from_text(citation)
-        if len(matched_DOIs) > 0:
+        matched_dois = doi.extract_from_text(citation)
+        if len(matched_dois) > 0:
             # Add the DOI and go on
-            dois[citation] = next(iter(matched_DOIs))
+            dois[citation] = next(iter(matched_dois))
             continue
         # Same thing for arXiv id
-        matched_arXiv = arxiv.extract_from_text(citation)
-        if len(matched_arXiv) > 0:
+        matched_arxiv = arxiv.extract_from_text(citation)
+        if len(matched_arxiv) > 0:
             # Add the associated DOI and go on
-            dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
+            dois[citation] = arxiv.to_doi(next(iter(matched_arxiv)))
             continue
         # If no match found, stack it for next step
         # Note to remove URLs in the citation as the plaintext citations can
         # contain URLs and they are bad for the CrossRef API.
-        crossref_queue.append(tools.remove_URLs(citation))
+        crossref_queue.append(tools.remove_urls(citation))

     # Do batch with remaining papers, to prevent from the timeout of CrossRef
     for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
         batch = [i for i in batch]
         try:
             # Fetch results from CrossRef
-            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
-            for result in r.json()["results"]:
+            request = requests.post(CROSSREF_LINKS_API_URL, json=batch)
+            for result in request.json()["results"]:
                 # Try to get a DOI
                 try:
                     dois[result["text"]] = result["doi"]
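
A sketch of the renamed entry point, assuming (as the docstring suggests) a plaintext file with one citation per line; the path is hypothetical:

    from libbmc.citations import plaintext

    # Maps each citation to the DOI found for it (direct regex match,
    # arXiv lookup, or the CrossRef links API as a fallback)
    dois = plaintext.get_cited_dois("/tmp/citations.txt")
    for citation, found_doi in dois.items():
        print(citation, "->", found_doi)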

View File

@@ -55,7 +55,7 @@ def is_valid(doi):
     False
     """
     match = REGEX.match(doi)
-    return ((match is not None) and (match.group(0) == doi))
+    return (match is not None) and (match.group(0) == doi)


 def extract_from_text(text):

@@ -71,17 +71,17 @@ def extract_from_text(text):
     return tools.remove_duplicates(REGEX.findall(text))


-def to_URL(dois):
+def to_url(dois):
     """
     Convert a list of canonical DOIs to a list of DOIs URLs.

     :param dois: List of canonical DOIs. Can also be a single canonical DOI.
     :returns: A list of DOIs URLs (resp. a single value).

-    >>> to_URL(['10.1209/0295-5075/111/40005'])
+    >>> to_url(['10.1209/0295-5075/111/40005'])
     ['http://dx.doi.org/10.1209/0295-5075/111/40005']

-    >>> to_URL('10.1209/0295-5075/111/40005')
+    >>> to_url('10.1209/0295-5075/111/40005')
     'http://dx.doi.org/10.1209/0295-5075/111/40005'
     """
     if isinstance(dois, list):

@@ -110,13 +110,7 @@ def to_canonical(urls):
     >>> to_canonical(['aaaa']) is None
     True
     """
-    try:
-        if isinstance(urls, list):
-            return [next(iter(extract_from_text(url))) for url in urls]
-        else:
-            return next(iter(extract_from_text(urls)))
-    except StopIteration:
-        return None
+    return tools.map_or_apply(extract_from_text, urls)


 def get_oa_version(doi):

@@ -134,10 +128,10 @@ def get_oa_version(doi):
     'http://arxiv.org/abs/1506.06690'
     """
     try:
-        r = requests.get("%s%s" % (DISSEMIN_API, doi))
-        r.raise_for_status()
-        result = r.json()
-        assert(result["status"] == "ok")
+        request = requests.get("%s%s" % (DISSEMIN_API, doi))
+        request.raise_for_status()
+        result = request.json()
+        assert result["status"] == "ok"
         return result["paper"]["pdf_url"]
     except (AssertionError, ValueError, KeyError, RequestException):
         return None

@@ -162,10 +156,10 @@ def get_oa_policy(doi):
     True
     """
     try:
-        r = requests.get("%s%s" % (DISSEMIN_API, doi))
-        r.raise_for_status()
-        result = r.json()
-        assert(result["status"] == "ok")
+        request = requests.get("%s%s" % (DISSEMIN_API, doi))
+        request.raise_for_status()
+        result = request.json()
+        assert result["status"] == "ok"
         return ([i
                  for i in result["paper"]["publications"]
                  if i["doi"] == doi][0])["policy"]

@@ -185,8 +179,8 @@ def get_linked_version(doi):
     'http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01'
     """
     try:
-        r = requests.head(to_URL(doi))
-        return r.headers.get("location")
+        request = requests.head(to_url(doi))
+        return request.headers.get("location")
     except RequestException:
         return None

@@ -206,10 +200,10 @@ def get_bibtex(doi):
     '@article{Verney_2015,\\n\\tdoi = {10.1209/0295-5075/111/40005},\\n\\turl = {http://dx.doi.org/10.1209/0295-5075/111/40005},\\n\\tyear = 2015,\\n\\tmonth = {aug},\\n\\tpublisher = {{IOP} Publishing},\\n\\tvolume = {111},\\n\\tnumber = {4},\\n\\tpages = {40005},\\n\\tauthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\n\\ttitle = {Hybridization of first and second sound in a weakly interacting Bose gas},\\n\\tjournal = {{EPL}}\\n}'
     """
     try:
-        r = requests.get(to_URL(doi),
-                         headers={"accept": "application/x-bibtex"})
-        r.raise_for_status()
-        assert(r.headers.get("content-type") == "application/x-bibtex")
-        return r.text
+        request = requests.get(to_url(doi),
+                               headers={"accept": "application/x-bibtex"})
+        request.raise_for_status()
+        assert request.headers.get("content-type") == "application/x-bibtex"
+        return request.text
     except (RequestException, AssertionError):
         return None
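
A short sketch of the snake_case API after the renames, using the DOI from the module's own doctests:

    from libbmc import doi

    doi_id = "10.1209/0295-5075/111/40005"
    print(doi.is_valid(doi_id))                   # True
    print(doi.to_url(doi_id))                     # 'http://dx.doi.org/10.1209/...'
    print(doi.to_canonical(doi.to_url(doi_id)))   # back to the canonical form
    print(doi.get_bibtex(doi_id))                 # BibTeX entry fetched over HTTP, or None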

View File

@@ -3,24 +3,89 @@ This file contains functions to download locally some papers, eventually using
 a proxy.
 """
 import socket
-import socks
 import sys
 import urllib

+import socks
+
 # Default socket to use, if no proxy is used
 DEFAULT_SOCKET = socket.socket


-def download(url, proxies=[None]):
+def _download_helper(url):
+    """
+    Handle the download of an URL, using the proxy currently set in \
+            :mod:`socks`.
+
+    :param url: The URL to download.
+    :returns: A tuple of the raw content of the downloaded data and its \
+            associated content-type. Returns None if it was \
+            unable to download the document.
+    """
+    # Try to fetch the URL using the current proxy
+    try:
+        request = urllib.request.urlopen(url)
+        try:
+            size = int(dict(request.info())['content-length'].strip())
+        except KeyError:
+            try:
+                size = int(dict(request.info())['Content-Length'].strip())
+            except KeyError:
+                size = 0
+        # Download the document
+        doc = b""
+        doc_size = 0
+        while True:
+            buf = request.read(1024)
+            if buf:
+                doc += buf
+                doc_size += len(buf)
+                if size != 0:
+                    # Write progress bar on stdout
+                    done = int(50 * doc_size / size)
+                    sys.stdout.write("\r[%s%s]" %
+                                     ('='*done, ' '*(50-done)))
+                    sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
+                    sys.stdout.flush()
+            else:
+                break
+        # Fetch content type
+        contenttype = None
+        contenttype_req = None
+        try:
+            contenttype_req = dict(request.info())['content-type']
+        except KeyError:
+            try:
+                contenttype_req = dict(request.info())['Content-Type']
+            except KeyError:
+                return None
+        if 'pdf' in contenttype_req:
+            contenttype = 'pdf'
+        elif 'djvu' in contenttype_req:
+            contenttype = 'djvu'
+        # Check content type and status code are ok
+        if request.getcode() != 200 or contenttype is None:
+            # Else, try with the next available proxy
+            return None
+        # Return a tuple of the downloaded content and the content-type
+        return (doc, contenttype)
+    # If an exception occurred, continue with next available proxy
+    except (urllib.error.URLError, socket.error, ValueError):
+        return None
+
+
+def download(url, proxies=None):
     """
     Download a PDF or DJVU document from a url, eventually using proxies.

     :params url: The URL to the PDF/DJVU document to fetch.
     :params proxies: An optional list of proxies to use. Proxies will be \
             used sequentially. Proxies should be a list of proxy strings. \
-            Do not forget to include ``None`` in the list if you want to try \
-            direct fetching without any proxy.
+            Do not forget to include ``""`` (empty string) in the list if \
+            you want to try direct fetching without any proxy.

     :returns: A tuple of the raw content of the downloaded data and its \
             associated content-type. Returns ``(None, None)`` if it was \

@@ -28,10 +93,14 @@ def download(url, proxies=[None]):
     >>> download("http://arxiv.org/pdf/1312.4006.pdf")  # doctest: +SKIP
     """
+    # Handle default argument
+    if proxies is None:
+        proxies = [""]
+
     # Loop over all available connections
     for proxy in proxies:
         # Handle no proxy case
-        if proxy is None:
+        if proxy == "":
             socket.socket = DEFAULT_SOCKET
         # Handle SOCKS proxy
         elif proxy.startswith('socks'):

@@ -55,58 +124,9 @@ def download(url, proxies=[None]):
             socks.set_default_proxy(socks.HTTP, proxy, port)
             socket.socket = socks.socksocket

-        # Try to fetch the URL using the current proxy
-        try:
-            r = urllib.request.urlopen(url)
-            try:
-                size = int(dict(r.info())['content-length'].strip())
-            except KeyError:
-                try:
-                    size = int(dict(r.info())['Content-Length'].strip())
-                except KeyError:
-                    size = 0
-            # Download the document
-            dl = b""
-            dl_size = 0
-            while True:
-                buf = r.read(1024)
-                if buf:
-                    dl += buf
-                    dl_size += len(buf)
-                    if size != 0:
-                        # Write progress bar on stdout
-                        done = int(50 * dl_size / size)
-                        sys.stdout.write("\r[%s%s]" %
-                                         ('='*done, ' '*(50-done)))
-                        sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
-                        sys.stdout.flush()
-                else:
-                    break
-            # Fetch content type
-            contenttype = False
-            contenttype_req = None
-            try:
-                contenttype_req = dict(r.info())['content-type']
-            except KeyError:
-                try:
-                    contenttype_req = dict(r.info())['Content-Type']
-                except KeyError:
-                    continue
-            if 'pdf' in contenttype_req:
-                contenttype = 'pdf'
-            elif 'djvu' in contenttype_req:
-                contenttype = 'djvu'
-            # Check content type and status code are ok
-            if r.getcode() != 200 or contenttype is False:
-                # Else, try with the next available proxy
-                continue
-            # Return a tuple of the downloaded content and the content-type
-            return (dl, contenttype)
-        # If an exception occurred, continue with next available proxy
-        except (urllib.error.URLError, socket.error, ValueError):
-            continue
+        downloaded = _download_helper(url)
+        if downloaded is not None:
+            return downloaded

     # In case of running out of proxies, return (None, None)
     return (None, None)
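
A sketch of the new calling convention: with ``proxies=None`` the default ``[""]`` is used (direct connection), and an empty string now stands for "no proxy" where ``None`` was used before. The proxy address below is a hypothetical local SOCKS proxy:

    from libbmc import fetcher

    # Try a direct connection first, then fall back to a SOCKS proxy
    data, filetype = fetcher.download("http://arxiv.org/pdf/1312.4006.pdf",
                                      proxies=["", "socks5://127.0.0.1:1080"])
    if data is not None:
        with open("paper.%s" % filetype, "wb") as fh:
            fh.write(data)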

View File

@@ -11,11 +11,11 @@ from libbmc import __valid_identifiers__
 __valid_identifiers__ += ["isbn"]


-def is_valid(isbn):
+def is_valid(isbn_id):
     """
     Check that a given string is a valid ISBN.

-    :param isbn: the isbn to be checked.
+    :param isbn_id: the isbn to be checked.
     :returns: boolean indicating whether the isbn is valid or not.

     >>> is_valid("978-3-16-148410-0")

@@ -43,9 +43,9 @@
     True
     """
     return (
-        (not isbnlib.notisbn(isbn)) and (
-            isbnlib.get_canonical_isbn(isbn) == isbn or
-            isbnlib.mask(isbnlib.get_canonical_isbn(isbn)) == isbn)
+        (not isbnlib.notisbn(isbn_id)) and (
+            isbnlib.get_canonical_isbn(isbn_id) == isbn_id or
+            isbnlib.mask(isbnlib.get_canonical_isbn(isbn_id)) == isbn_id)
     )

@@ -64,28 +64,28 @@ def extract_from_text(text):
     return [i for i in isbns if i is not None]


-def get_bibtex(isbn):
+def get_bibtex(isbn_identifier):
     """
     Get a BibTeX string for the given ISBN.

-    :param isbn: ISBN to fetch BibTeX entry for.
+    :param isbn_identifier: ISBN to fetch BibTeX entry for.
     :returns: A BibTeX string or ``None`` if could not fetch it.

     >>> get_bibtex('9783161484100')
     '@book{9783161484100,\\n title = {Berkeley, Oakland: Albany, Emeryville, Alameda, Kensington},\\n author = {Peekaboo Maps},\\n isbn = {9783161484100},\\n year = {2009},\\n publisher = {Peek A Boo Maps}\\n}'
     """
     # Try to find the BibTeX using associated DOIs
-    bibtex = doi.get_bibtex(to_DOI(isbn))
+    bibtex = doi.get_bibtex(to_doi(isbn_identifier))

     if bibtex is None:
         # In some cases, there are no DOIs for a given ISBN. In this case, try
         # to fetch bibtex directly from the ISBN, using a combination of
         # Google Books and worldcat.org results.
         bibtex = isbnlib.registry.bibformatters['bibtex'](
-            isbnlib.meta(isbn, 'default'))
+            isbnlib.meta(isbn_identifier, 'default'))

     return bibtex


-def to_DOI(isbn):
+def to_doi(isbn_identifier):
     """
     Make a DOI out of the given ISBN.

@@ -94,16 +94,16 @@ def to_DOI(isbn):
     See https://github.com/xlcnd/isbnlib#note. The returned DOI may not be
     issued yet.

-    :param isbn: A valid ISBN string.
+    :param isbn_identifier: A valid ISBN string.
     :returns: A DOI as string.

-    >>> to_DOI('9783161484100')
+    >>> to_doi('9783161484100')
     '10.978.316/1484100'
     """
-    return isbnlib.doi(isbn)
+    return isbnlib.doi(isbn_identifier)


-def from_DOI(doi):
+def from_doi(doi_identifier):
     """
     Make an ISBN out of the given DOI.

@@ -119,10 +119,10 @@ def from_DOI(doi):
     issued yet (it is a valid one, but not necessary corresponding to a
     valid book).

-    :param doi: A valid canonical DOI.
+    :param doi_identifier: A valid canonical DOI.
     :returns: An ISBN string.

-    >>> from_DOI('10.978.316/1484100')
+    >>> from_doi('10.978.316/1484100')
     '9783161484100'
     """
-    return "".join(c for c in doi[2:] if c in "0123456789xX")
+    return "".join(c for c in doi_identifier[2:] if c in "0123456789xX")

View File

@@ -1,6 +0,0 @@
-from libbmc.papers import identifiers
-
-__all__ = [
-    "identifiers",
-    "tearpages"
-]

View File

@@ -13,8 +13,8 @@ import sys
 from libbmc import __valid_identifiers__

 # Import all the modules associated to __valid_identifiers__
-for type in __valid_identifiers__:
-    importlib.import_module("libbmc.%s" % (type,))
+for valid_identifier in __valid_identifiers__:
+    importlib.import_module("libbmc.%s" % (valid_identifier,))


 def find_identifiers(src):

@@ -53,18 +53,19 @@ def find_identifiers(src):
     while totext.poll() is None:
         extract_full = ' '.join([i.decode("utf-8").strip()
                                  for i in totext.stdout.readlines()])

         # Loop over all the valid identifier types
-        for type in __valid_identifiers__:
+        for identifier in __valid_identifiers__:
             # Dynamically call the ``extract_from_text`` method for the
             # associated module.
-            m = sys.modules.get("libbmc.%s" % (type,), None)
-            if m is None:
+            module = sys.modules.get("libbmc.%s" % (identifier,), None)
+            if module is None:
                 continue
-            found_id = getattr(m, "extract_from_text")(extract_full)
+            found_id = getattr(module, "extract_from_text")(extract_full)
             if found_id:
                 totext.terminate()
-                return (type, found_id[0])  # found_id is a list of found IDs
+                # found_id is a list of found IDs
+                return (identifier, found_id[0])

     return (None, None)

@@ -80,12 +81,12 @@ def get_bibtex(identifier):
     :returns: A BibTeX string or ``None`` if an error occurred.
     # TODO: Should return a BiBTeX object?
     """
-    type, id = identifier
-    if type not in __valid_identifiers__:
+    identifier_type, identifier_id = identifier
+    if identifier_type not in __valid_identifiers__:
         return None

     # Dynamically call the ``get_bibtex`` method from the associated module.
-    m = sys.modules.get("libbmc.%s" % (type,), None)
-    if m is None:
+    module = sys.modules.get("libbmc.%s" % (identifier_type,), None)
+    if module is None:
         return None

-    return getattr(m, "get_bibtex")(id)
+    return getattr(module, "get_bibtex")(identifier_id)
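
A sketch of how the (type, id) tuple returned by ``find_identifiers`` feeds ``get_bibtex``; the PDF path is hypothetical:

    from libbmc.papers import identifiers

    identifier = identifiers.find_identifiers("/tmp/paper.pdf")
    if identifier != (None, None):
        # identifier is a (type, id) tuple, e.g. ("doi", "10.1209/...")
        print(identifiers.get_bibtex(identifier))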

View File

@@ -21,7 +21,7 @@ BAD_JOURNALS = {
 }


-def fixPdf(pdfFile, destination):
+def fix_pdf(pdf_file, destination):
     """
     Fix malformed pdf files when data are present after '%%EOF'

@@ -33,18 +33,16 @@ def fixPdf(pdfFile, destination):
     :param destination: destination
     """
     tmp = tempfile.NamedTemporaryFile()
-    output = open(tmp.name, 'wb')
-
-    with open(pdfFile, "rb") as fh:
-        with open(pdfFile, "rb") as fh:
+    with open(tmp.name, 'wb') as output:
+        with open(pdf_file, "rb") as fh:
             for line in fh:
                 output.write(line)
                 if b'%%EOF' in line:
                     break
-    output.close()
     shutil.copy(tmp.name, destination)


-def tearpage_backend(filename, teared_pages=[0]):
+def tearpage_backend(filename, teared_pages=None):
     """
     Copy filename to a tempfile, write pages to filename except the teared one.

@@ -56,29 +54,35 @@
     :param teared_pages: Numbers of the pages to tear. Default to first page \
             only.
     """
+    # Handle default argument
+    if teared_pages is None:
+        teared_pages = [0]
+
     # Copy the pdf to a tmp file
-    tmp = tempfile.NamedTemporaryFile()
-    shutil.copy(filename, tmp.name)
-
-    # Read the copied pdf
-    try:
-        input_file = PdfFileReader(open(tmp.name, 'rb'))
-    except PdfReadError:
-        fixPdf(filename, tmp.name)
-        input_file = PdfFileReader(open(tmp.name, 'rb'))
-
-    # Seek for the number of pages
-    num_pages = input_file.getNumPages()
-
-    # Write pages excepted the first one
-    output_file = PdfFileWriter()
-    for i in range(num_pages):
-        if i in teared_pages:
-            continue
-        output_file.addPage(input_file.getPage(i))
-
-    tmp.close()
-    outputStream = open(filename, "wb")
-    output_file.write(outputStream)
+    with tempfile.NamedTemporaryFile() as tmp:
+        # Copy the input file to tmp
+        shutil.copy(filename, tmp.name)
+
+        # Read the copied pdf
+        # TODO: Use with syntax
+        try:
+            input_file = PdfFileReader(open(tmp.name, 'rb'))
+        except PdfReadError:
+            fix_pdf(filename, tmp.name)
+            input_file = PdfFileReader(open(tmp.name, 'rb'))
+
+        # Seek for the number of pages
+        num_pages = input_file.getNumPages()
+
+        # Write pages excepted the first one
+        output_file = PdfFileWriter()
+        for i in range(num_pages):
+            if i in teared_pages:
+                continue
+            output_file.addPage(input_file.getPage(i))
+
+        tmp.close()
+        outputStream = open(filename, "wb")
+        output_file.write(outputStream)


 def tearpage_needed(bibtex):

@@ -89,16 +93,16 @@ def tearpage_needed(bibtex):
             whether tearing is needed.
     :returns: A list of pages to tear.
     """
-    for p in BAD_JOURNALS:
-        if p in bibtex.get("journal", "").lower():
+    for publisher in BAD_JOURNALS:
+        if publisher in bibtex.get("journal", "").lower():
             # Bad journal is found, add pages to tear
-            return BAD_JOURNALS[p]
+            return BAD_JOURNALS[publisher]

     # If no bad journals are found, return an empty list
     return []


-def tearpage(filename, bibtex=None, force=False):
+def tearpage(filename, bibtex=None, force=None):
     """
     Tear some pages of the file if needed.

@@ -112,7 +116,7 @@ def tearpage(filename, bibtex=None, force=False):
     """
     # Fetch pages to tear
     pages_to_tear = []
-    if force is not False:
+    if force is not None:
         pages_to_tear = force
     elif bibtex is not None:
        pages_to_tear = tearpage_needed(bibtex)
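
A sketch of the new ``force`` convention: ``None`` (the default) means "decide from the BibTeX entry", while a list of page numbers forces those pages to be torn. The paths and the BibTeX dict below are hypothetical:

    from libbmc.papers import tearpages

    # Tear pages based on the journal found in the BibTeX entry
    tearpages.tearpage("/tmp/paper.pdf", bibtex={"journal": "Some Journal"})

    # Force tearing of the first page, whatever the journal
    tearpages.tearpage("/tmp/paper.pdf", force=[0])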

View File

@@ -1,5 +0,0 @@
-from libbmc.repositories import arxiv, hal
-
-__all__ = [
-    "arxiv", "hal"
-]

View File

@@ -1,15 +1,18 @@
 """
 This file contains all the arXiv-related functions.
 """
-import arxiv2bib
-import bibtexparser
 import io
 import re
-import requests
 import tarfile
 import xml.etree.ElementTree

 from urllib.error import HTTPError

+import arxiv2bib
+import bibtexparser
+import requests
 from requests.exceptions import RequestException

@@ -268,7 +271,7 @@ def is_valid(arxiv_id):
     False
     """
     match = REGEX.match(arxiv_id)
-    return ((match is not None) and (match.group(0) == arxiv_id))
+    return (match is not None) and (match.group(0) == arxiv_id)


 def get_bibtex(arxiv_id):

@@ -320,17 +323,17 @@ def extract_from_text(text):
                                       for i in REGEX.findall(text) if i[0] != ''])


-def to_URL(arxiv_ids):
+def to_url(arxiv_ids):
     """
     Convert a list of canonical DOIs to a list of DOIs URLs.

     :param dois: List of canonical DOIs.
     :returns: A list of DOIs URLs.

-    >>> to_URL('1506.06690')
+    >>> to_url('1506.06690')
     'http://arxiv.org/abs/1506.06690'

-    >>> to_URL('1506.06690v1')
+    >>> to_url('1506.06690v1')
     'http://arxiv.org/abs/1506.06690v1'
     """
     if isinstance(arxiv_ids, list):

@@ -358,16 +361,10 @@ def to_canonical(urls):
     >>> to_canonical('aaa') is None
     True
     """
-    try:
-        if isinstance(urls, list):
-            return [next(iter(extract_from_text(url))) for url in urls]
-        else:
-            return next(iter(extract_from_text(urls)))
-    except StopIteration:
-        return None
+    return tools.map_or_apply(extract_from_text, urls)


-def from_DOI(doi):
+def from_doi(doi):
     """
     Get the arXiv eprint id for a given DOI.

@@ -379,29 +376,29 @@ def from_DOI(doi):
     :param doi: The DOI of the resource to look for.
     :returns: The arXiv eprint id, or ``None`` if not found.

-    >>> from_DOI('10.1209/0295-5075/111/40005')
+    >>> from_doi('10.1209/0295-5075/111/40005')
     # Note: Test do not pass due to an arXiv API bug.
     '1506.06690'
     """
     try:
-        r = requests.get("http://export.arxiv.org/api/query",
-                         params={
-                             "search_query": "doi:%s" % (doi,),
-                             "max_results": 1
-                         })
-        r.raise_for_status()
+        request = requests.get("http://export.arxiv.org/api/query",
+                               params={
+                                   "search_query": "doi:%s" % (doi,),
+                                   "max_results": 1
+                               })
+        request.raise_for_status()
     except RequestException:
         return None
-    e = xml.etree.ElementTree.fromstring(r.content)
-    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
-        id = entry.find("{http://www.w3.org/2005/Atom}id").text
-        # id is an arXiv full URL. We only want the id which is the last URL
-        # component.
-        return id.split("/")[-1]
+    root = xml.etree.ElementTree.fromstring(request.content)
+    for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
+        arxiv_id = entry.find("{http://www.w3.org/2005/Atom}id").text
+        # arxiv_id is an arXiv full URL. We only want the id which is the last
+        # URL component.
+        return arxiv_id.split("/")[-1]
     return None


-def to_DOI(arxiv_id):
+def to_doi(arxiv_id):
     """
     Get the associated DOI for a given arXiv eprint.

@@ -413,23 +410,23 @@ def to_DOI(arxiv_id):
     :param eprint: The arXiv eprint id.
     :returns: The DOI if any, or ``None``.

-    >>> to_DOI('1506.06690v1')
+    >>> to_doi('1506.06690v1')
     '10.1209/0295-5075/111/40005'

-    >>> to_DOI('1506.06690')
+    >>> to_doi('1506.06690')
     '10.1209/0295-5075/111/40005'
     """
     try:
-        r = requests.get("http://export.arxiv.org/api/query",
-                         params={
-                             "id_list": arxiv_id,
-                             "max_results": 1
-                         })
-        r.raise_for_status()
+        request = requests.get("http://export.arxiv.org/api/query",
+                               params={
+                                   "id_list": arxiv_id,
+                                   "max_results": 1
+                               })
+        request.raise_for_status()
     except RequestException:
         return None
-    e = xml.etree.ElementTree.fromstring(r.content)
-    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
+    root = xml.etree.ElementTree.fromstring(request.content)
+    for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
         doi = entry.find("{http://arxiv.org/schemas/atom}doi")
         if doi is not None:
             return doi.text

@@ -451,9 +448,9 @@ def get_sources(arxiv_id):
             ``None``.
     """
     try:
-        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
-        r.raise_for_status()
-        file_object = io.BytesIO(r.content)
+        request = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
+        request.raise_for_status()
+        file_object = io.BytesIO(request.content)
         return tarfile.open(fileobj=file_object)
     except (RequestException, AssertionError, tarfile.TarError):
         return None

@@ -473,9 +470,9 @@ def get_bbl(arxiv_id):
     :returns: A list of the full text of the ``.bbl`` files (if any) \
             or ``None``.
     """
-    tf = get_sources(arxiv_id)
-    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
-    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
+    tar_file = get_sources(arxiv_id)
+    bbl_files = [i for i in tar_file.getmembers() if i.name.endswith(".bbl")]
+    bbl_files = [tar_file.extractfile(member).read().decode(tarfile.ENCODING)
                  for member in bbl_files]
     return bbl_files

@@ -498,5 +495,5 @@ def get_citations(arxiv_id):
     bbl_files = get_bbl(arxiv_id)
     for bbl_file in bbl_files:
         # Fetch the cited DOIs for each of the bbl files
-        dois.update(bbl.get_cited_DOIs(bbl_file))
+        dois.update(bbl.get_cited_dois(bbl_file))
     return dois
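
A sketch of the renamed arXiv helpers, using the eprint from the module doctests:

    from libbmc.repositories import arxiv

    print(arxiv.to_doi("1506.06690"))     # '10.1209/0295-5075/111/40005'
    print(arxiv.from_doi("10.1209/0295-5075/111/40005"))  # '1506.06690'
    print(arxiv.to_url("1506.06690"))     # 'http://arxiv.org/abs/1506.06690'

    # DOIs cited by the paper, extracted from the .bbl files in its sources
    print(arxiv.get_citations("1506.06690"))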

View File

@@ -33,7 +33,7 @@ def is_valid(hal_id):
     False
     """
     match = REGEX.match(hal_id)
-    return ((match is not None) and (match.group(0) == hal_id))
+    return (match is not None) and (match.group(0) == hal_id)


 def extract_from_text(text):

View File

@@ -9,9 +9,11 @@ from itertools import islice, chain

 # Huge URL regex taken from https://gist.github.com/gruber/8891611
 URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")

+_SLUGIFY_STRIP_RE = re.compile(r'[^\w\s-]')
+_SLUGIFY_HYPHENATE_RE = re.compile(r'[\s]+')
+

-def replaceAll(text, replace_dict):
+def replace_all(text, replace_dict):
     """
     Replace multiple strings in a text.

@@ -26,7 +28,7 @@ def replaceAll(text, replace_dict):
             substitution.
     :returns: Text after replacements.

-    >>> replaceAll("foo bar foo thing", {"foo": "oof", "bar": "rab"})
+    >>> replace_all("foo bar foo thing", {"foo": "oof", "bar": "rab"})
     'oof rab oof thing'
     """
     for i, j in replace_dict.items():

@@ -34,6 +36,24 @@ def replaceAll(text, replace_dict):
     return text


+def map_or_apply(function, param):
+    """
+    Map the function on ``param``, or apply it, depending whether ``param`` \
+            is a list or an item.
+
+    :param function: The function to apply.
+    :param param: The parameter to feed the function with (list or item).
+    :returns: The computed value or ``None``.
+    """
+    try:
+        if isinstance(param, list):
+            return [next(iter(function(i))) for i in param]
+        else:
+            return next(iter(function(param)))
+    except StopIteration:
+        return None
+
+
 def clean_whitespaces(text):
     """
     Remove multiple whitespaces from text. Also removes leading and trailing \

@@ -85,13 +105,13 @@ def batch(iterable, size):
     >>> [list(i) for i in batch([1, 2, 3, 4, 5], 2)]
     [[1, 2], [3, 4], [5]]
     """
-    it = iter(iterable)
+    item = iter(iterable)
     while True:
-        bi = islice(it, size)
-        yield chain([next(bi)], bi)
+        batch_iterator = islice(item, size)
+        yield chain([next(batch_iterator)], batch_iterator)


-def remove_URLs(text):
+def remove_urls(text):
     """
     Remove URLs from a given text (only removes http, https and naked domains \
             URLs).

@@ -99,16 +119,12 @@
     :param text: The text to remove URLs from.
     :returns: The text without URLs.

-    >>> remove_URLs("foobar http://example.com https://example.com foobar")
+    >>> remove_urls("foobar http://example.com https://example.com foobar")
     'foobar foobar'
     """
     return clean_whitespaces(URL_REGEX.sub("", text))


-_slugify_strip_re = re.compile(r'[^\w\s-]')
-_slugify_hyphenate_re = re.compile(r'[\s]+')
-
-
 def slugify(value):
     """
     Normalizes string, converts to lowercase, removes non-alpha characters,

@@ -127,5 +143,5 @@
     value = unicode_type(value)
     value = (unicodedata.normalize('NFKD', value).
              encode('ascii', 'ignore').decode('ascii'))
-    value = unicode_type(_slugify_strip_re.sub('', value).strip())
-    return _slugify_hyphenate_re.sub('_', value)
+    value = unicode_type(_SLUGIFY_STRIP_RE.sub('', value).strip())
+    return _SLUGIFY_HYPHENATE_RE.sub('_', value)
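
A sketch of the new ``map_or_apply`` helper (extracted from the former bodies of ``doi.to_canonical`` and ``arxiv.to_canonical``) together with the renamed utilities:

    from libbmc import doi, tools

    # Works on a single item...
    print(tools.map_or_apply(doi.extract_from_text,
                             "http://dx.doi.org/10.1209/0295-5075/111/40005"))
    # ...or on a list of items
    print(tools.map_or_apply(doi.extract_from_text,
                             ["http://dx.doi.org/10.1209/0295-5075/111/40005"]))

    print(tools.remove_urls("foobar http://example.com foobar"))   # 'foobar foobar'
    print([list(i) for i in tools.batch([1, 2, 3, 4, 5], 2)])      # [[1, 2], [3, 4], [5]]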