Passing pylint on the module

parent a69e7ef6c1
commit a2ee654eac
@@ -1,15 +1,11 @@
"""
libbmc

The :mod:`libbmc` is a generic Python library to manage bibliography and play
with scientific papers.
"""

# Global list of valid paper identifier types. See README.md.
__valid_identifiers__ = []

# Import order of the modules is important, as they will populate
# `__valid_identifiers__` on load, and the order in this list reflects their
# priority.
from libbmc import bibtex, doi, fetcher, isbn  # noqa
from libbmc import citations, papers, repositories  # noqa

__version__ = "0.1.3.1"

__all__ = [
    "bibtex", "doi", "fetcher", "isbn",
    "citations", "papers", "repositories",
]
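The import-order comment above relies on a registration pattern: each identifier module appends its own name to `__valid_identifiers__` when it is imported, and consumers resolve the matching module dynamically (exactly what `libbmc.papers.identifiers` does further down in this commit). A minimal sketch of that lookup, assuming every registered module exposes an `extract_from_text()` function; the sample string is made up:

import importlib

from libbmc import __valid_identifiers__

SAMPLE = "See http://dx.doi.org/10.1209/0295-5075/111/40005 for details."

for name in __valid_identifiers__:
    # Resolve the registered module and run its text extractor on the sample
    module = importlib.import_module("libbmc.%s" % (name,))
    found = module.extract_from_text(SAMPLE)
    if found:
        print(name, found[0])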
@@ -1,5 +0,0 @@
from libbmc.citations import bbl, bibtex, pdf, plaintext

__all__ = [
    "bbl", "bibtex", "pdf", "plaintext"
]
@@ -73,7 +73,7 @@ def get_plaintext_citations(bbl):
    return cleaned_bbl


def get_cited_DOIs(bbl):
def get_cited_dois(bbl):
    """
    Get the DOIs of the papers cited in a .bbl file.

@@ -85,4 +85,4 @@ def get_cited_DOIs(bbl):
    # Get the plaintext citations from the bbl file
    plaintext_citations = get_plaintext_citations(bbl)
    # Use the plaintext citations parser on these citations
    return plaintext.get_cited_DOIs(plaintext_citations)
    return plaintext.get_cited_dois(plaintext_citations)
@@ -2,15 +2,20 @@
This files contains all the functions to extract DOIs of citations from
BibTeX files.
"""
import bibtexparser
import os


import bibtexparser

from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode


from libbmc import tools
from libbmc.citations import plaintext

# TODO: Use beta.dissem.in with formatted citation


def bibentry_as_plaintext(bibentry):
    """
@@ -51,7 +56,7 @@ def get_plaintext_citations(bibtex):
    return bibentries


def get_cited_DOIs(bibtex):
def get_cited_dois(bibtex):
    """
    Get the DOIs of the papers cited in a BibTeX file.

@@ -71,4 +76,4 @@ def get_cited_DOIs(bibtex):
    # Get the plaintext citations from the bibtex file
    plaintext_citations = get_plaintext_citations(bibtex)
    # Use the plaintext citations parser on these citations
    return plaintext.get_cited_DOIs(plaintext_citations)
    return plaintext.get_cited_dois(plaintext_citations)
@@ -3,10 +3,11 @@ This files contains all the functions to extract DOIs of citations from
PDF files.
"""
import os
import requests
import subprocess
import xml.etree.ElementTree as ET

import requests

from requests.exceptions import RequestException

from libbmc import tools
@@ -17,7 +18,7 @@ CERMINE_BASE_URL = "http://cermine.ceon.pl/"
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def cermine(pdf_file, force_API=False, override_local=None):
def cermine(pdf_file, force_api=False, override_local=None):
    """
    Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract metadata from \
        the given PDF file, to retrieve citations (and more) from the \
@@ -44,7 +45,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
    the CERMINE API terms.

    :param pdf_file: Path to the PDF file to handle.
    :param force_API: Force the use of the Cermine API \
    :param force_api: Force the use of the Cermine API \
            (and do not try to use a local JAR file). Defaults to ``False``.
    :param override_local: Use this specific JAR file, instead of the one at \
            the default location (``libbmc/external/cermine.jar``).
@@ -55,23 +56,23 @@ def cermine(pdf_file, force_API=False, override_local=None):
        # Check if we want to load the local JAR from a specific path
        local = override_local
        # Else, try to stat the JAR file at the expected local path
        if (local is None) and (not force_API):
        if (local is None) and (not force_api):
            if os.path.isfile(os.path.join(SCRIPT_DIR,
                                           "../external/cermine.jar")):
                local = os.path.join(SCRIPT_DIR,
                                     "../external/cermine.jar")

        # If we want to force the API use, or we could not get a local JAR
        if force_API or (local is None):
        if force_api or (local is None):
            print("Using API")
            with open(pdf_file, "rb") as fh:
                # Query the API
                r = requests.post(
                    CERMINE_BASE_URL + "extract.do",
                    headers={"Content-Type": "application/binary"},
                    files={"file": fh}
                )
                return r.text
                # Query the API
                request = requests.post(
                    CERMINE_BASE_URL + "extract.do",
                    headers={"Content-Type": "application/binary"},
                    files={"file": fh}
                )
                return request.text
        # Else, use the local JAR file
        else:
            return subprocess.check_output([
@@ -86,7 +87,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
        return None


def cermine_dois(pdf_file, force_API=False, override_local=None):
def cermine_dois(pdf_file, force_api=False, override_local=None):
    """
    Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract DOIs of cited \
        papers from a PDF file.
@@ -116,7 +117,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
    try to match them on Crossref to get DOIs.

    :param pdf_file: Path to the PDF file to handle.
    :param force_API: Force the use of the Cermine API \
    :param force_api: Force the use of the Cermine API \
            (and do not try to use a local JAR file). Defaults to ``False``.
    :param override_local: Use this specific JAR file, instead of the one at \
            the default location (``libbmc/external/cermine.jar``).
@@ -126,7 +127,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
    # * Do not convert to plain text, but use the extra metadata from
    # CERMINE
    # Call CERMINE on the PDF file
    cermine_output = cermine(pdf_file, force_API, override_local)
    cermine_output = cermine(pdf_file, force_api, override_local)
    # Parse the resulting XML
    root = ET.fromstring(cermine_output)
    plaintext_references = [
@@ -136,7 +137,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
        ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
        for e in root.iter("mixed-citation")]
    # Call the plaintext methods to fetch DOIs
    return plaintext.get_cited_DOIs(plaintext_references)
    return plaintext.get_cited_dois(plaintext_references)


def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
@@ -156,6 +157,8 @@ def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
    :param grobid_jar: Path to the built Grobid JAR file.
    :returns: ``True``, or ``False`` if an error occurred.
    """
    # TODO: Should be using https://github.com/kermitt2/grobid-example and
    # BibTeX backend.
    if grobid_home is None or grobid_jar is None:
        # User should pass the correct paths
        return False
@@ -234,4 +237,4 @@ def pdfextract_dois(pdf_file):
    root = ET.fromstring(references)
    plaintext_references = [e.text for e in root.iter("reference")]
    # Call the plaintext methods to fetch DOIs
    return plaintext.get_cited_DOIs(plaintext_references)
    return plaintext.get_cited_dois(plaintext_references)
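Both DOI-extraction paths above (`cermine_dois()` and `pdfextract_dois()`) end by handing plain-text references to `plaintext.get_cited_dois()`. A hedged usage sketch of the CERMINE path, with `paper.pdf` as a placeholder file and `force_api=True` to skip the local JAR lookup (which implies accepting the CERMINE API terms mentioned in the docstring):

from libbmc.citations import pdf

# Returns a dict mapping each extracted plaintext citation to a matched DOI
cited_dois = pdf.cermine_dois("paper.pdf", force_api=True)
for citation, doi in cited_dois.items():
    print(doi, "<-", citation[:60])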
@@ -37,7 +37,7 @@ def get_plaintext_citations(file):
    return cleaned_citations


def get_cited_DOIs(file):
def get_cited_dois(file):
    """
    Get the DOIs of the papers cited in a plaintext file. The file should \
    have one citation per line.
@@ -66,29 +66,29 @@ def get_cited_DOIs(file):
    # Try to get the DOI directly from the citation
    for citation in plaintext_citations[:]:
        # Some citations already contain a DOI so try to match it directly
        matched_DOIs = doi.extract_from_text(citation)
        if len(matched_DOIs) > 0:
        matched_dois = doi.extract_from_text(citation)
        if len(matched_dois) > 0:
            # Add the DOI and go on
            dois[citation] = next(iter(matched_DOIs))
            dois[citation] = next(iter(matched_dois))
            continue
        # Same thing for arXiv id
        matched_arXiv = arxiv.extract_from_text(citation)
        if len(matched_arXiv) > 0:
        matched_arxiv = arxiv.extract_from_text(citation)
        if len(matched_arxiv) > 0:
            # Add the associated DOI and go on
            dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
            dois[citation] = arxiv.to_doi(next(iter(matched_arxiv)))
            continue
        # If no match found, stack it for next step
        # Note to remove URLs in the citation as the plaintext citations can
        # contain URLs and they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))
        crossref_queue.append(tools.remove_urls(citation))

    # Do batch with remaining papers, to prevent from the timeout of CrossRef
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        batch = [i for i in batch]
        try:
            # Fetch results from CrossRef
            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in r.json()["results"]:
            request = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in request.json()["results"]:
                # Try to get a DOI
                try:
                    dois[result["text"]] = result["doi"]
@@ -55,7 +55,7 @@ def is_valid(doi):
    False
    """
    match = REGEX.match(doi)
    return ((match is not None) and (match.group(0) == doi))
    return (match is not None) and (match.group(0) == doi)


def extract_from_text(text):
@@ -71,17 +71,17 @@ def extract_from_text(text):
    return tools.remove_duplicates(REGEX.findall(text))


def to_URL(dois):
def to_url(dois):
    """
    Convert a list of canonical DOIs to a list of DOIs URLs.

    :param dois: List of canonical DOIs. Can also be a single canonical DOI.
    :returns: A list of DOIs URLs (resp. a single value).

    >>> to_URL(['10.1209/0295-5075/111/40005'])
    >>> to_url(['10.1209/0295-5075/111/40005'])
    ['http://dx.doi.org/10.1209/0295-5075/111/40005']

    >>> to_URL('10.1209/0295-5075/111/40005')
    >>> to_url('10.1209/0295-5075/111/40005')
    'http://dx.doi.org/10.1209/0295-5075/111/40005'
    """
    if isinstance(dois, list):
@@ -110,13 +110,7 @@ def to_canonical(urls):
    >>> to_canonical(['aaaa']) is None
    True
    """
    try:
        if isinstance(urls, list):
            return [next(iter(extract_from_text(url))) for url in urls]
        else:
            return next(iter(extract_from_text(urls)))
    except StopIteration:
        return None
    return tools.map_or_apply(extract_from_text, urls)


def get_oa_version(doi):
@@ -134,10 +128,10 @@ def get_oa_version(doi):
    'http://arxiv.org/abs/1506.06690'
    """
    try:
        r = requests.get("%s%s" % (DISSEMIN_API, doi))
        r.raise_for_status()
        result = r.json()
        assert(result["status"] == "ok")
        request = requests.get("%s%s" % (DISSEMIN_API, doi))
        request.raise_for_status()
        result = request.json()
        assert result["status"] == "ok"
        return result["paper"]["pdf_url"]
    except (AssertionError, ValueError, KeyError, RequestException):
        return None
@@ -162,10 +156,10 @@ def get_oa_policy(doi):
    True
    """
    try:
        r = requests.get("%s%s" % (DISSEMIN_API, doi))
        r.raise_for_status()
        result = r.json()
        assert(result["status"] == "ok")
        request = requests.get("%s%s" % (DISSEMIN_API, doi))
        request.raise_for_status()
        result = request.json()
        assert result["status"] == "ok"
        return ([i
                 for i in result["paper"]["publications"]
                 if i["doi"] == doi][0])["policy"]
@@ -185,8 +179,8 @@ def get_linked_version(doi):
    'http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01'
    """
    try:
        r = requests.head(to_URL(doi))
        return r.headers.get("location")
        request = requests.head(to_url(doi))
        return request.headers.get("location")
    except RequestException:
        return None

@@ -206,10 +200,10 @@ def get_bibtex(doi):
    '@article{Verney_2015,\\n\\tdoi = {10.1209/0295-5075/111/40005},\\n\\turl = {http://dx.doi.org/10.1209/0295-5075/111/40005},\\n\\tyear = 2015,\\n\\tmonth = {aug},\\n\\tpublisher = {{IOP} Publishing},\\n\\tvolume = {111},\\n\\tnumber = {4},\\n\\tpages = {40005},\\n\\tauthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\n\\ttitle = {Hybridization of first and second sound in a weakly interacting Bose gas},\\n\\tjournal = {{EPL}}\\n}'
    """
    try:
        r = requests.get(to_URL(doi),
                         headers={"accept": "application/x-bibtex"})
        r.raise_for_status()
        assert(r.headers.get("content-type") == "application/x-bibtex")
        return r.text
        request = requests.get(to_url(doi),
                               headers={"accept": "application/x-bibtex"})
        request.raise_for_status()
        assert request.headers.get("content-type") == "application/x-bibtex"
        return request.text
    except (RequestException, AssertionError):
        return None
@@ -3,24 +3,89 @@ This file contains functions to download locally some papers, eventually using
a proxy.
"""
import socket
import socks
import sys
import urllib

import socks


# Default socket to use, if no proxy is used
DEFAULT_SOCKET = socket.socket


def download(url, proxies=[None]):
def _download_helper(url):
    """
    Handle the download of an URL, using the proxy currently set in \
        :mod:`socks`.

    :param url: The URL to download.
    :returns: A tuple of the raw content of the downloaded data and its \
            associated content-type. Returns None if it was \
            unable to download the document.
    """
    # Try to fetch the URL using the current proxy
    try:
        request = urllib.request.urlopen(url)
        try:
            size = int(dict(request.info())['content-length'].strip())
        except KeyError:
            try:
                size = int(dict(request.info())['Content-Length'].strip())
            except KeyError:
                size = 0
        # Download the document
        doc = b""
        doc_size = 0
        while True:
            buf = request.read(1024)
            if buf:
                doc += buf
                doc_size += len(buf)
                if size != 0:
                    # Write progress bar on stdout
                    done = int(50 * doc_size / size)
                    sys.stdout.write("\r[%s%s]" %
                                     ('='*done, ' '*(50-done)))
                    sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
                    sys.stdout.flush()
            else:
                break
        # Fetch content type
        contenttype = None
        contenttype_req = None
        try:
            contenttype_req = dict(request.info())['content-type']
        except KeyError:
            try:
                contenttype_req = dict(request.info())['Content-Type']
            except KeyError:
                return None
        if 'pdf' in contenttype_req:
            contenttype = 'pdf'
        elif 'djvu' in contenttype_req:
            contenttype = 'djvu'

        # Check content type and status code are ok
        if request.getcode() != 200 or contenttype is None:
            # Else, try with the next available proxy
            return None

        # Return a tuple of the downloaded content and the content-type
        return (doc, contenttype)
    # If an exception occurred, continue with next available proxy
    except (urllib.error.URLError, socket.error, ValueError):
        return None


def download(url, proxies=None):
    """
    Download a PDF or DJVU document from a url, eventually using proxies.

    :params url: The URL to the PDF/DJVU document to fetch.
    :params proxies: An optional list of proxies to use. Proxies will be \
            used sequentially. Proxies should be a list of proxy strings. \
            Do not forget to include ``None`` in the list if you want to try \
            direct fetching without any proxy.
            Do not forget to include ``""`` (empty string) in the list if \
            you want to try direct fetching without any proxy.

    :returns: A tuple of the raw content of the downloaded data and its \
            associated content-type. Returns ``(None, None)`` if it was \
@@ -28,10 +93,14 @@ def download(url, proxies=[None]):

    >>> download("http://arxiv.org/pdf/1312.4006.pdf") # doctest: +SKIP
    """
    # Handle default argument
    if proxies is None:
        proxies = [""]

    # Loop over all available connections
    for proxy in proxies:
        # Handle no proxy case
        if proxy is None:
        if proxy == "":
            socket.socket = DEFAULT_SOCKET
        # Handle SOCKS proxy
        elif proxy.startswith('socks'):
@@ -55,58 +124,9 @@ def download(url, proxies=[None]):
            socks.set_default_proxy(socks.HTTP, proxy, port)
            socket.socket = socks.socksocket

        # Try to fetch the URL using the current proxy
        try:
            r = urllib.request.urlopen(url)
            try:
                size = int(dict(r.info())['content-length'].strip())
            except KeyError:
                try:
                    size = int(dict(r.info())['Content-Length'].strip())
                except KeyError:
                    size = 0
            # Download the document
            dl = b""
            dl_size = 0
            while True:
                buf = r.read(1024)
                if buf:
                    dl += buf
                    dl_size += len(buf)
                    if size != 0:
                        # Write progress bar on stdout
                        done = int(50 * dl_size / size)
                        sys.stdout.write("\r[%s%s]" %
                                         ('='*done, ' '*(50-done)))
                        sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
                        sys.stdout.flush()
                else:
                    break
            # Fetch content type
            contenttype = False
            contenttype_req = None
            try:
                contenttype_req = dict(r.info())['content-type']
            except KeyError:
                try:
                    contenttype_req = dict(r.info())['Content-Type']
                except KeyError:
                    continue
            if 'pdf' in contenttype_req:
                contenttype = 'pdf'
            elif 'djvu' in contenttype_req:
                contenttype = 'djvu'

            # Check content type and status code are ok
            if r.getcode() != 200 or contenttype is False:
                # Else, try with the next available proxy
                continue

            # Return a tuple of the downloaded content and the content-type
            return (dl, contenttype)
        # If an exception occurred, continue with next available proxy
        except (urllib.error.URLError, socket.error, ValueError):
            continue
        downloaded = _download_helper(url)
        if downloaded is not None:
            return downloaded

    # In case of running out of proxies, return (None, None)
    return (None, None)
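The refactor above keeps the behaviour of `download()` but moves the per-proxy fetch into `_download_helper()`. A short usage sketch based on the docstring; the URL comes from the doctest, and `""` asks for a direct connection (proxy strings could be appended to the list):

from libbmc import fetcher

content, content_type = fetcher.download(
    "http://arxiv.org/pdf/1312.4006.pdf", proxies=[""])
if content_type == "pdf":
    # Only write the file out if a PDF actually came back
    with open("1312.4006.pdf", "wb") as fh:
        fh.write(content)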
@@ -11,11 +11,11 @@ from libbmc import __valid_identifiers__
__valid_identifiers__ += ["isbn"]


def is_valid(isbn):
def is_valid(isbn_id):
    """
    Check that a given string is a valid ISBN.

    :param isbn: the isbn to be checked.
    :param isbn_id: the isbn to be checked.
    :returns: boolean indicating whether the isbn is valid or not.

    >>> is_valid("978-3-16-148410-0")
@@ -43,9 +43,9 @@ def is_valid(isbn):
    True
    """
    return (
        (not isbnlib.notisbn(isbn)) and (
            isbnlib.get_canonical_isbn(isbn) == isbn or
            isbnlib.mask(isbnlib.get_canonical_isbn(isbn)) == isbn)
        (not isbnlib.notisbn(isbn_id)) and (
            isbnlib.get_canonical_isbn(isbn_id) == isbn_id or
            isbnlib.mask(isbnlib.get_canonical_isbn(isbn_id)) == isbn_id)
    )


@@ -64,28 +64,28 @@ def extract_from_text(text):
    return [i for i in isbns if i is not None]


def get_bibtex(isbn):
def get_bibtex(isbn_identifier):
    """
    Get a BibTeX string for the given ISBN.

    :param isbn: ISBN to fetch BibTeX entry for.
    :param isbn_identifier: ISBN to fetch BibTeX entry for.
    :returns: A BibTeX string or ``None`` if could not fetch it.

    >>> get_bibtex('9783161484100')
    '@book{9783161484100,\\n title = {Berkeley, Oakland: Albany, Emeryville, Alameda, Kensington},\\n author = {Peekaboo Maps},\\n isbn = {9783161484100},\\n year = {2009},\\n publisher = {Peek A Boo Maps}\\n}'
    """
    # Try to find the BibTeX using associated DOIs
    bibtex = doi.get_bibtex(to_DOI(isbn))
    bibtex = doi.get_bibtex(to_doi(isbn_identifier))
    if bibtex is None:
        # In some cases, there are no DOIs for a given ISBN. In this case, try
        # to fetch bibtex directly from the ISBN, using a combination of
        # Google Books and worldcat.org results.
        bibtex = isbnlib.registry.bibformatters['bibtex'](
            isbnlib.meta(isbn, 'default'))
            isbnlib.meta(isbn_identifier, 'default'))
    return bibtex


def to_DOI(isbn):
def to_doi(isbn_identifier):
    """
    Make a DOI out of the given ISBN.

@@ -94,16 +94,16 @@ def to_DOI(isbn):
    See https://github.com/xlcnd/isbnlib#note. The returned DOI may not be
    issued yet.

    :param isbn: A valid ISBN string.
    :param isbn_identifier: A valid ISBN string.
    :returns: A DOI as string.

    >>> to_DOI('9783161484100')
    >>> to_doi('9783161484100')
    '10.978.316/1484100'
    """
    return isbnlib.doi(isbn)
    return isbnlib.doi(isbn_identifier)


def from_DOI(doi):
def from_doi(doi_identifier):
    """
    Make an ISBN out of the given DOI.

@@ -119,10 +119,10 @@ def from_DOI(doi):
    issued yet (it is a valid one, but not necessary corresponding to a
    valid book).

    :param doi: A valid canonical DOI.
    :param doi_identifier: A valid canonical DOI.
    :returns: An ISBN string.

    >>> from_DOI('10.978.316/1484100')
    >>> from_doi('10.978.316/1484100')
    '9783161484100'
    """
    return "".join(c for c in doi[2:] if c in "0123456789xX")
    return "".join(c for c in doi_identifier[2:] if c in "0123456789xX")
@@ -1,6 +0,0 @@
from libbmc.papers import identifiers

__all__ = [
    "identifiers",
    "tearpages"
]
@@ -13,8 +13,8 @@ import sys
from libbmc import __valid_identifiers__

# Import all the modules associated to __valid_identifiers__
for type in __valid_identifiers__:
    importlib.import_module("libbmc.%s" % (type,))
for valid_identifier in __valid_identifiers__:
    importlib.import_module("libbmc.%s" % (valid_identifier,))


def find_identifiers(src):
@@ -53,18 +53,19 @@ def find_identifiers(src):

    while totext.poll() is None:
        extract_full = ' '.join([i.decode("utf-8").strip()
                                for i in totext.stdout.readlines()])
                                 for i in totext.stdout.readlines()])
        # Loop over all the valid identifier types
        for type in __valid_identifiers__:
        for identifier in __valid_identifiers__:
            # Dynamically call the ``extract_from_text`` method for the
            # associated module.
            m = sys.modules.get("libbmc.%s" % (type,), None)
            if m is None:
            module = sys.modules.get("libbmc.%s" % (identifier,), None)
            if module is None:
                continue
            found_id = getattr(m, "extract_from_text")(extract_full)
            found_id = getattr(module, "extract_from_text")(extract_full)
            if found_id:
                totext.terminate()
                return (type, found_id[0])  # found_id is a list of found IDs
                # found_id is a list of found IDs
                return (identifier, found_id[0])
    return (None, None)


@@ -80,12 +81,12 @@ def get_bibtex(identifier):
    :returns: A BibTeX string or ``None`` if an error occurred.
    # TODO: Should return a BiBTeX object?
    """
    type, id = identifier
    if type not in __valid_identifiers__:
    identifier_type, identifier_id = identifier
    if identifier_type not in __valid_identifiers__:
        return None

    # Dynamically call the ``get_bibtex`` method from the associated module.
    m = sys.modules.get("libbmc.%s" % (type,), None)
    if m is None:
    module = sys.modules.get("libbmc.%s" % (identifier_type,), None)
    if module is None:
        return None
    return getattr(m, "get_bibtex")(id)
    return getattr(module, "get_bibtex")(identifier_id)
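`get_bibtex()` above takes the same `(identifier_type, identifier_id)` tuple that `find_identifiers()` returns, so the two chain naturally. A minimal sketch, with `some_paper.pdf` as a placeholder for a PDF containing a recognisable identifier:

from libbmc.papers import identifiers

identifier = identifiers.find_identifiers("some_paper.pdf")
if identifier != (None, None):
    # identifier is e.g. ("doi", "10.1209/0295-5075/111/40005")
    print(identifiers.get_bibtex(identifier))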
@@ -21,7 +21,7 @@ BAD_JOURNALS = {
}


def fixPdf(pdfFile, destination):
def fix_pdf(pdf_file, destination):
    """
    Fix malformed pdf files when data are present after '%%EOF'

@@ -33,18 +33,16 @@ def fixPdf(pdfFile, destination):
    :param destination: destination
    """
    tmp = tempfile.NamedTemporaryFile()
    output = open(tmp.name, 'wb')
    with open(pdfFile, "rb") as fh:
        with open(pdfFile, "rb") as fh:
    with open(tmp.name, 'wb') as output:
        with open(pdf_file, "rb") as fh:
            for line in fh:
                output.write(line)
                if b'%%EOF' in line:
                    break
    output.close()
    shutil.copy(tmp.name, destination)


def tearpage_backend(filename, teared_pages=[0]):
def tearpage_backend(filename, teared_pages=None):
    """
    Copy filename to a tempfile, write pages to filename except the teared one.

@@ -56,29 +54,35 @@ def tearpage_backend(filename, teared_pages=[0]):
    :param teared_pages: Numbers of the pages to tear. Default to first page \
            only.
    """
    # Handle default argument
    if teared_pages is None:
        teared_pages = [0]

    # Copy the pdf to a tmp file
    tmp = tempfile.NamedTemporaryFile()
    shutil.copy(filename, tmp.name)
    with tempfile.NamedTemporaryFile() as tmp:
        # Copy the input file to tmp
        shutil.copy(filename, tmp.name)

    # Read the copied pdf
    try:
        input_file = PdfFileReader(open(tmp.name, 'rb'))
    except PdfReadError:
        fixPdf(filename, tmp.name)
        input_file = PdfFileReader(open(tmp.name, 'rb'))
    # Seek for the number of pages
    num_pages = input_file.getNumPages()
        # Read the copied pdf
        # TODO: Use with syntax
        try:
            input_file = PdfFileReader(open(tmp.name, 'rb'))
        except PdfReadError:
            fix_pdf(filename, tmp.name)
            input_file = PdfFileReader(open(tmp.name, 'rb'))
        # Seek for the number of pages
        num_pages = input_file.getNumPages()

    # Write pages excepted the first one
    output_file = PdfFileWriter()
    for i in range(num_pages):
        if i in teared_pages:
            continue
        output_file.addPage(input_file.getPage(i))
        # Write pages excepted the first one
        output_file = PdfFileWriter()
        for i in range(num_pages):
            if i in teared_pages:
                continue
            output_file.addPage(input_file.getPage(i))

    tmp.close()
    outputStream = open(filename, "wb")
    output_file.write(outputStream)
        tmp.close()
        outputStream = open(filename, "wb")
        output_file.write(outputStream)


def tearpage_needed(bibtex):
@@ -89,16 +93,16 @@ def tearpage_needed(bibtex):
    whether tearing is needed.
    :returns: A list of pages to tear.
    """
    for p in BAD_JOURNALS:
        if p in bibtex.get("journal", "").lower():
    for publisher in BAD_JOURNALS:
        if publisher in bibtex.get("journal", "").lower():
            # Bad journal is found, add pages to tear
            return BAD_JOURNALS[p]
            return BAD_JOURNALS[publisher]

    # If no bad journals are found, return an empty list
    return []


def tearpage(filename, bibtex=None, force=False):
def tearpage(filename, bibtex=None, force=None):
    """
    Tear some pages of the file if needed.

@@ -112,7 +116,7 @@ def tearpage(filename, bibtex=None, force=False):
    """
    # Fetch pages to tear
    pages_to_tear = []
    if force is not False:
    if force is not None:
        pages_to_tear = force
    elif bibtex is not None:
        pages_to_tear = tearpage_needed(bibtex)
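With the new signature, `force` is no longer a boolean: leaving it at ``None`` keeps the BibTeX-based detection through `tearpage_needed()`, while passing a list of page numbers tears exactly those pages. A short sketch of both call styles ("paper.pdf" and the BibTeX entry are placeholders; whether the first call tears anything depends on the `BAD_JOURNALS` table):

from libbmc.papers import tearpages

# Let the journal name found in the BibTeX entry decide which pages to tear
tearpages.tearpage("paper.pdf", bibtex={"journal": "Some Journal"})

# Explicitly tear the first page, whatever the BibTeX entry says
tearpages.tearpage("paper.pdf", force=[0])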
@@ -1,5 +0,0 @@
from libbmc.repositories import arxiv, hal

__all__ = [
    "arxiv", "hal"
]
@@ -1,15 +1,18 @@
"""
This file contains all the arXiv-related functions.
"""
import arxiv2bib
import bibtexparser
import io
import re
import requests
import tarfile
import xml.etree.ElementTree

from urllib.error import HTTPError


import arxiv2bib
import bibtexparser
import requests

from requests.exceptions import RequestException


@@ -268,7 +271,7 @@ def is_valid(arxiv_id):
    False
    """
    match = REGEX.match(arxiv_id)
    return ((match is not None) and (match.group(0) == arxiv_id))
    return (match is not None) and (match.group(0) == arxiv_id)


def get_bibtex(arxiv_id):
@@ -320,17 +323,17 @@ def extract_from_text(text):
                                    for i in REGEX.findall(text) if i[0] != ''])


def to_URL(arxiv_ids):
def to_url(arxiv_ids):
    """
    Convert a list of canonical DOIs to a list of DOIs URLs.

    :param dois: List of canonical DOIs.
    :returns: A list of DOIs URLs.

    >>> to_URL('1506.06690')
    >>> to_url('1506.06690')
    'http://arxiv.org/abs/1506.06690'

    >>> to_URL('1506.06690v1')
    >>> to_url('1506.06690v1')
    'http://arxiv.org/abs/1506.06690v1'
    """
    if isinstance(arxiv_ids, list):
@@ -358,16 +361,10 @@ def to_canonical(urls):
    >>> to_canonical('aaa') is None
    True
    """
    try:
        if isinstance(urls, list):
            return [next(iter(extract_from_text(url))) for url in urls]
        else:
            return next(iter(extract_from_text(urls)))
    except StopIteration:
        return None
    return tools.map_or_apply(extract_from_text, urls)


def from_DOI(doi):
def from_doi(doi):
    """
    Get the arXiv eprint id for a given DOI.

@@ -379,29 +376,29 @@ def from_DOI(doi):
    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found.

    >>> from_DOI('10.1209/0295-5075/111/40005')
    >>> from_doi('10.1209/0295-5075/111/40005')
    # Note: Test do not pass due to an arXiv API bug.
    '1506.06690'
    """
    try:
        r = requests.get("http://export.arxiv.org/api/query",
                         params={
                             "search_query": "doi:%s" % (doi,),
                             "max_results": 1
                         })
        r.raise_for_status()
        request = requests.get("http://export.arxiv.org/api/query",
                               params={
                                   "search_query": "doi:%s" % (doi,),
                                   "max_results": 1
                               })
        request.raise_for_status()
    except RequestException:
        return None
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        id = entry.find("{http://www.w3.org/2005/Atom}id").text
        # id is an arXiv full URL. We only want the id which is the last URL
        # component.
        return id.split("/")[-1]
    root = xml.etree.ElementTree.fromstring(request.content)
    for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
        arxiv_id = entry.find("{http://www.w3.org/2005/Atom}id").text
        # arxiv_id is an arXiv full URL. We only want the id which is the last
        # URL component.
        return arxiv_id.split("/")[-1]
    return None


def to_DOI(arxiv_id):
def to_doi(arxiv_id):
    """
    Get the associated DOI for a given arXiv eprint.

@@ -413,23 +410,23 @@ def to_DOI(arxiv_id):
    :param eprint: The arXiv eprint id.
    :returns: The DOI if any, or ``None``.

    >>> to_DOI('1506.06690v1')
    >>> to_doi('1506.06690v1')
    '10.1209/0295-5075/111/40005'

    >>> to_DOI('1506.06690')
    >>> to_doi('1506.06690')
    '10.1209/0295-5075/111/40005'
    """
    try:
        r = requests.get("http://export.arxiv.org/api/query",
                         params={
                             "id_list": arxiv_id,
                             "max_results": 1
                         })
        r.raise_for_status()
        request = requests.get("http://export.arxiv.org/api/query",
                               params={
                                   "id_list": arxiv_id,
                                   "max_results": 1
                               })
        request.raise_for_status()
    except RequestException:
        return None
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
    root = xml.etree.ElementTree.fromstring(request.content)
    for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
        doi = entry.find("{http://arxiv.org/schemas/atom}doi")
        if doi is not None:
            return doi.text
@@ -451,9 +448,9 @@ def get_sources(arxiv_id):
    ``None``.
    """
    try:
        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
        r.raise_for_status()
        file_object = io.BytesIO(r.content)
        request = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
        request.raise_for_status()
        file_object = io.BytesIO(request.content)
        return tarfile.open(fileobj=file_object)
    except (RequestException, AssertionError, tarfile.TarError):
        return None
@@ -473,9 +470,9 @@ def get_bbl(arxiv_id):
    :returns: A list of the full text of the ``.bbl`` files (if any) \
            or ``None``.
    """
    tf = get_sources(arxiv_id)
    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
    tar_file = get_sources(arxiv_id)
    bbl_files = [i for i in tar_file.getmembers() if i.name.endswith(".bbl")]
    bbl_files = [tar_file.extractfile(member).read().decode(tarfile.ENCODING)
                 for member in bbl_files]
    return bbl_files

@@ -498,5 +495,5 @@ def get_citations(arxiv_id):
    bbl_files = get_bbl(arxiv_id)
    for bbl_file in bbl_files:
        # Fetch the cited DOIs for each of the bbl files
        dois.update(bbl.get_cited_DOIs(bbl_file))
        dois.update(bbl.get_cited_dois(bbl_file))
    return dois
@@ -33,7 +33,7 @@ def is_valid(hal_id):
    False
    """
    match = REGEX.match(hal_id)
    return ((match is not None) and (match.group(0) == hal_id))
    return (match is not None) and (match.group(0) == hal_id)


def extract_from_text(text):
@@ -9,9 +9,11 @@ from itertools import islice, chain

# Huge URL regex taken from https://gist.github.com/gruber/8891611
URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")
_SLUGIFY_STRIP_RE = re.compile(r'[^\w\s-]')
_SLUGIFY_HYPHENATE_RE = re.compile(r'[\s]+')


def replaceAll(text, replace_dict):
def replace_all(text, replace_dict):
    """
    Replace multiple strings in a text.

@@ -26,7 +28,7 @@ def replaceAll(text, replace_dict):
    substitution.
    :returns: Text after replacements.

    >>> replaceAll("foo bar foo thing", {"foo": "oof", "bar": "rab"})
    >>> replace_all("foo bar foo thing", {"foo": "oof", "bar": "rab"})
    'oof rab oof thing'
    """
    for i, j in replace_dict.items():
@@ -34,6 +36,24 @@ def replaceAll(text, replace_dict):
    return text


def map_or_apply(function, param):
    """
    Map the function on ``param``, or apply it, depending whether ``param`` \
        is a list or an item.

    :param function: The function to apply.
    :param param: The parameter to feed the function with (list or item).
    :returns: The computed value or ``None``.
    """
    try:
        if isinstance(param, list):
            return [next(iter(function(i))) for i in param]
        else:
            return next(iter(function(param)))
    except StopIteration:
        return None
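`map_or_apply()` is the shared helper behind the simplified `to_canonical()` bodies in the `doi` and `arxiv` modules earlier in this commit: it applies an extractor to a single item or maps it over a list, and turns an empty result into ``None``. A small illustration with a throwaway extractor that is not part of the library:

from libbmc import tools

def first_word(text):
    # Toy extractor: returns a list of candidates, possibly empty
    return text.split()

print(tools.map_or_apply(first_word, "hello world"))        # 'hello'
print(tools.map_or_apply(first_word, ["foo bar", "baz"]))   # ['foo', 'baz']
print(tools.map_or_apply(first_word, ""))                   # None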

def clean_whitespaces(text):
    """
    Remove multiple whitespaces from text. Also removes leading and trailing \
@@ -85,13 +105,13 @@ def batch(iterable, size):
    >>> [list(i) for i in batch([1, 2, 3, 4, 5], 2)]
    [[1, 2], [3, 4], [5]]
    """
    it = iter(iterable)
    item = iter(iterable)
    while True:
        bi = islice(it, size)
        yield chain([next(bi)], bi)
        batch_iterator = islice(item, size)
        yield chain([next(batch_iterator)], batch_iterator)


def remove_URLs(text):
def remove_urls(text):
    """
    Remove URLs from a given text (only removes http, https and naked domains \
        URLs).
@@ -99,16 +119,12 @@ def remove_URLs(text):
    :param text: The text to remove URLs from.
    :returns: The text without URLs.

    >>> remove_URLs("foobar http://example.com https://example.com foobar")
    >>> remove_urls("foobar http://example.com https://example.com foobar")
    'foobar foobar'
    """
    return clean_whitespaces(URL_REGEX.sub("", text))


_slugify_strip_re = re.compile(r'[^\w\s-]')
_slugify_hyphenate_re = re.compile(r'[\s]+')


def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
@@ -127,5 +143,5 @@ def slugify(value):
    value = unicode_type(value)
    value = (unicodedata.normalize('NFKD', value).
             encode('ascii', 'ignore').decode('ascii'))
    value = unicode_type(_slugify_strip_re.sub('', value).strip())
    return _slugify_hyphenate_re.sub('_', value)
    value = unicode_type(_SLUGIFY_STRIP_RE.sub('', value).strip())
    return _SLUGIFY_HYPHENATE_RE.sub('_', value)