Reimport bbl citations parsing and make some minor fixes
This commit is contained in:
parent 97eb5a3ae0
commit d8b74ae356
libbmc/citations/bbl.py (new file, 125 lines added)
@@ -0,0 +1,125 @@
"""
This file contains all the functions to deal with .bbl files.
"""
import os
import re
import requests
import subprocess

from requests.exceptions import RequestException

from libbmc import doi
from libbmc import tools
from libbmc.repositories import arxiv


# Regex to match bibitems
BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
# Regex to match end of bibliography
ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")


# CrossRef API URL
CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
CROSSREF_MAX_BATCH_SIZE = 10


def bibitem_as_plaintext(bibitem):
    """
    Return a plaintext representation of the bibitem from the ``.bbl`` file.

    .. note::

        This plaintext representation can be super ugly, contain URLs and so \
        on.

    :param bibitem: The text content of the bibitem.
    :returns: A cleaned plaintext citation from the bibitem.
    """
    try:
        output = subprocess.check_output(["delatex",
                                          "-s"],
                                         input=bibitem.encode("utf-8"))
    except FileNotFoundError:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        output = subprocess.check_output(["%s/../external/opendetex/delatex" %
                                          (script_dir,),
                                          "-s"],
                                         input=bibitem.encode("utf-8"))
    output = output.decode("utf-8")
    output = tools.clean_whitespaces(output)
    return output


def get_plaintext_citations(bbl):
    """
    Parse a ``*.bbl`` file to get a clean list of plaintext citations.

    :param bbl: Either the path to the .bbl file or the content of a ``.bbl`` \
        file.
    :returns: A list of cleaned plaintext citations.
    """
    # Handle path or content
    if os.path.isfile(bbl):
        with open(bbl, 'r') as fh:
            bbl_content = fh.read()
    else:
        bbl_content = bbl
    # Get a list of bibitems, taking the first item out as it is *before* the
    # first \bibitem
    bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
    # Delete the text after the \end{thebibliography}
    bibitems = [ENDTHEBIBLIOGRAPHY_REGEX.sub("", i).strip() for i in bibitems]
    # Clean every bibitem to have plaintext
    cleaned_bbl = [bibitem_as_plaintext(bibitem) for bibitem in bibitems]
    return cleaned_bbl


def get_cited_DOIs(bbl):
    """
    Get the DOIs of the papers cited in this .bbl file.

    :param bbl: Either the path to a .bbl file or the content of a .bbl file.

    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    dois = {}
    crossref_queue = []
    # Get the plaintext citations from the bbl file
    plaintext_citations = get_plaintext_citations(bbl)
    # Try to get the DOI directly from the citation
    for citation in plaintext_citations[:]:
        # Some citations already contain a DOI so try to match it directly
        matched_DOIs = doi.extract_from_text(citation)
        if matched_DOIs:
            # Add the DOI and go on
            dois[citation] = matched_DOIs[0]
            continue
        # Same thing for arXiv id
        matched_arXiv = arxiv.extract_from_text(citation)
        if matched_arXiv:
            # Add the associated DOI and go on
            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
            continue
        # If no match found, stack it for next step
        # Note to remove URLs in the citation as the plaintext citations can
        # contain URLs and they are bad for the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))
    # Process the remaining citations in batches, to prevent a timeout of the
    # CrossRef API
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        # Materialize the batch (tools.batch yields lazy iterators) so it can
        # be serialized to JSON and re-iterated on error
        batch = list(batch)
        try:
            # Fetch results from CrossRef
            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in r.json()["results"]:
                # Try to get a DOI
                try:
                    dois[result["text"]] = result["doi"]
                except KeyError:
                    # Or set it to None
                    dois[result["text"]] = None
        except (RequestException, ValueError, KeyError):
            # If an exception occurred, set all the DOIs to None for the
            # current batch
            for i in batch:
                dois[i] = None
    return dois
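For reference, a minimal usage sketch of this new module; the paper.bbl path is only an example, and the CrossRef lookups obviously need network access:

from libbmc.citations import bbl

# Plaintext citations only (no DOI lookup)...
citations = bbl.get_plaintext_citations("paper.bbl")

# ...or resolve each citation to a DOI: direct DOI / arXiv id matches first,
# then a batched query against the CrossRef links API; unmatched citations
# map to None.
cited_dois = bbl.get_cited_DOIs("paper.bbl")
for citation, doi_id in cited_dois.items():
    print(citation, "->", doi_id)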
@@ -4,14 +4,16 @@ This file contains all the DOI-related functions.
import re
import requests

from requests.exceptions import RequestException

from libbmc import tools

# Taken from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
                   re.IGNORECASE)
# Base dx.doi.org URL for redirections
dx_url = "http://dx.doi.org/{doi}"
DX_URL = "http://dx.doi.org/{doi}"


def is_valid(doi):
@@ -21,7 +23,7 @@ def is_valid(doi):
    :param doi: The DOI to be checked.
    :returns: Boolean indicating whether the DOI is valid or not.
    """
    match = regex.match(doi)
    match = REGEX.match(doi)
    return ((match is not None) and (match.group(0) == doi))
@@ -32,7 +34,7 @@ def extract_from_text(text):
    :param text: The text to extract DOIs from.
    :returns: A list of found DOIs.
    """
    return tools.remove_duplicates(regex.findall(text))
    return tools.remove_duplicates(REGEX.findall(text))


def to_URL(dois):
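As a quick illustration of the renamed DOI regex in use (the strings below are made-up examples):

from libbmc import doi

# extract_from_text() pulls DOIs out of free text...
doi.extract_from_text("See 10.1000/xyz123 for details.")  # ["10.1000/xyz123"]

# ...while is_valid() only accepts a string that is exactly a DOI.
doi.is_valid("10.1000/xyz123")      # True
doi.is_valid("doi:10.1000/xyz123")  # False, the prefix breaks the full match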
@@ -43,9 +45,9 @@ def to_URL(dois):
    :returns: A list of DOI URLs.
    """
    if isinstance(dois, list):
        return [dx_url.format(doi=doi) for doi in dois]
        return [DX_URL.format(doi=doi) for doi in dois]
    else:
        return dx_url.format(doi=dois)
        return DX_URL.format(doi=dois)


def to_canonical(urls):
@@ -73,13 +75,13 @@ def get_oa_version(doi):
    :returns: The URL of the OA version of the given DOI, or ``None``.
    """
    # If DOI is a link, truncate it
    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
    try:
        assert(r.status_code == requests.codes.ok)
        r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
        r.raise_for_status()
        result = r.json()
        assert(result["status"] == "ok")
        return result["paper"]["pdf_url"]
    except (AssertionError, ValueError, KeyError):
    except (AssertionError, ValueError, KeyError, RequestException):
        return None
@@ -90,8 +92,11 @@ def get_linked_version(doi):
    :param doi: A canonical DOI.
    :returns: The canonical URL behind the DOI, or ``None``.
    """
    r = requests.head(to_URL(doi))
    return r.headers.get("location")
    try:
        r = requests.head(to_URL(doi))
        return r.headers.get("location")
    except RequestException:
        return None


def get_bibtex(doi):
@@ -105,9 +110,11 @@ def get_bibtex(doi):
    :param doi: The canonical DOI to get BibTeX from.
    :returns: A BibTeX string or ``None``.
    """
    r = requests.get(to_URL(doi),
                     headers={"accept": "application/x-bibtex"})
    if r.headers.get("content-type") == "application/x-bibtex":
    try:
        r = requests.get(to_URL(doi),
                         headers={"accept": "application/x-bibtex"})
        r.raise_for_status()
        assert(r.headers.get("content-type") == "application/x-bibtex")
        return r.text
    else:
    except (RequestException, AssertionError):
        return None
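A short sketch of how the hardened helpers above might be called; the DOI is only an example, and both functions now return None on any network or format problem:

from libbmc import doi

# BibTeX through dx.doi.org content negotiation
bibtex = doi.get_bibtex("10.1000/xyz123")

# URL of an open-access version, through the beta.dissem.in API
oa_url = doi.get_oa_version("10.1000/xyz123")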
@@ -35,10 +35,10 @@ def get_bibtex(isbn):
    :param isbn: ISBN to fetch BibTeX entry for.
    :returns: A BibTeX string.
    """
    return doi.get_bibtex(to_doi(isbn))
    return doi.get_bibtex(to_DOI(isbn))


def to_doi(isbn):
def to_DOI(isbn):
    """
    Try to fetch a DOI from a given ISBN.
@@ -1,17 +1,23 @@
"""
This file contains all the arXiv-related functions.
"""
import arxiv2bib
import io
import re
import requests
import tarfile
import xml.etree.ElementTree

from urllib.error import HTTPError
from requests.exceptions import RequestException


from libbmc import tools
from libbmc.citations import bbl


arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
arxiv_identifier_before_2007 = r"(" + ("|".join([
ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
    "astro-ph.GA",
    "astro-ph.CO",
    "astro-ph.EP",
@@ -159,15 +165,15 @@ arxiv_identifier_before_2007 = r"(" + ("|".join([
    "stat.ME",
    "stat.OT",
    "stat.TH"])) + r")/\d+"
regex = re.compile(
    "(" + arxiv_identifier_from_2007 + ")|(" +
    arxiv_identifier_before_2007 + ")",
REGEX = re.compile(
    "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" +
    ARXIV_IDENTIFIER_BEFORE_2007 + ")",
    re.IGNORECASE)

# Base arXiv URL used as id sometimes
arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"
# Eprint URL used to download sources
arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"


def is_valid(arxiv_id):
@@ -177,15 +183,35 @@ def is_valid(arxiv_id):
    :param arxiv_id: The arXiv ID to be checked.
    :returns: Boolean indicating whether the arXiv ID is valid or not.
    """
    match = regex.match(arxiv_id)
    match = REGEX.match(arxiv_id)
    return ((match is not None) and (match.group(0) == arxiv_id))


def get_bibtex(arxiv_id):
    """
    TODO
    Get a BibTeX entry for a given arXiv eprint.

    .. note::

        Uses the awesome https://pypi.python.org/pypi/arxiv2bib/ module.

    :param arxiv_id: The canonical arXiv id to get BibTeX from.
    :returns: A BibTeX string or ``None``.
    """
    assert(False)
    # Fetch bibtex using arxiv2bib module
    try:
        bibtex = arxiv2bib.arxiv2bib([arxiv_id])
    except HTTPError:
        bibtex = []

    for bib in bibtex:
        if isinstance(bib, arxiv2bib.ReferenceErrorInfo):
            continue
        else:
            # Return fetched bibtex
            return bib.bibtex()
    # An error occurred, return None
    return None


def extract_from_text(text):
@@ -195,7 +221,7 @@ def extract_from_text(text):
    :param text: The text to extract arXiv IDs from.
    :returns: A list of matching arXiv IDs.
    """
    return tools.remove_duplicates(regex.findall(text))
    return tools.remove_duplicates(REGEX.findall(text))


def to_URL(arxiv_ids):
@@ -206,9 +232,9 @@ def to_URL(arxiv_ids):
    :returns: A list of arXiv URLs.
    """
    if isinstance(arxiv_ids, list):
        return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
        return [ARXIV_URL.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
    else:
        return arxiv_url.format(arxiv_id=arxiv_ids)
        return ARXIV_URL.format(arxiv_id=arxiv_ids)


def to_canonical(urls):
@@ -236,11 +262,15 @@ def from_doi(doi):
    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found.
    """
    r = requests.get("http://export.arxiv.org/api/query",
                     params={
                         "search_query": "doi:%s" % (doi,),
                         "max_results": 1
                     })
    try:
        r = requests.get("http://export.arxiv.org/api/query",
                         params={
                             "search_query": "doi:%s" % (doi,),
                             "max_results": 1
                         })
        r.raise_for_status()
    except RequestException:
        return None
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        id = entry.find("{http://www.w3.org/2005/Atom}id").text
@@ -250,7 +280,7 @@ def from_doi(doi):
    return None


def to_doi(arxiv_id):
def to_DOI(arxiv_id):
    """
    Get the associated DOI for a given arXiv eprint.
@@ -262,11 +292,15 @@ def to_doi(arxiv_id):
    :param eprint: The arXiv eprint id.
    :returns: The DOI if any, or ``None``.
    """
    r = requests.get("http://export.arxiv.org/api/query",
                     params={
                         "id_list": arxiv_id,
                         "max_results": 1
                     })
    try:
        r = requests.get("http://export.arxiv.org/api/query",
                         params={
                             "id_list": arxiv_id,
                             "max_results": 1
                         })
        r.raise_for_status()
    except RequestException:
        return None
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        doi = entry.find("{http://arxiv.org/schemas/atom}doi")
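For illustration, the two arXiv API lookups above are meant to be rough inverses of each other; a small sketch, with example identifiers only:

from libbmc.repositories import arxiv

# DOI -> arXiv eprint id, through the export.arxiv.org query API
eprint_id = arxiv.from_doi("10.1000/xyz123")

# arXiv eprint id -> DOI, when the authors registered one (None otherwise)
doi_id = arxiv.to_DOI("1401.2910")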
@@ -284,12 +318,12 @@ def get_sources(arxiv_id):
    :returns: A ``TarFile`` object of the sources of the arXiv preprint or \
        ``None``.
    """
    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
    try:
        assert(r.status_code == requests.codes.ok)
        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
        r.raise_for_status()
        file_object = io.BytesIO(r.content)
        return tarfile.open(fileobj=file_object)
    except (AssertionError, tarfile.TarError):
    except (RequestException, AssertionError, tarfile.TarError):
        return None
@@ -297,8 +331,8 @@ def get_bbl(arxiv_id):
    """
    Get the .bbl files (if any) of a given preprint.

    :param eprint: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in a \
        canonical form.
    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
        a canonical form.
    :returns: A list of the full text of the ``.bbl`` files (if any) \
        or ``None``.
    """
@@ -311,6 +345,16 @@ def get_bbl(arxiv_id):

def get_citations(arxiv_id):
    """
    TODO
    Get the DOIs cited by a given preprint.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
        a canonical form.
    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    assert(False)
    dois = {}
    # Get the list of bbl files for this preprint
    bbl_files = get_bbl(arxiv_id)
    for bbl_file in bbl_files:
        # Fetch the cited DOIs for each of the bbl files
        dois.update(bbl.get_cited_DOIs(bbl_file))
    return dois
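Together with get_sources() and get_bbl(), the new get_citations() gives an end-to-end pipeline from an arXiv id to the DOIs it cites; a minimal sketch, reusing the example id from the docstrings above:

from libbmc.repositories import arxiv

# Download the e-print sources, extract the .bbl file(s) and resolve every
# citation to a DOI (or None) in one call.
cited = arxiv.get_citations("1401.2910")
for citation, doi_id in cited.items():
    print(citation, "->", doi_id)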
libbmc/repositories/hal.py (new file, 1 line added)
@@ -0,0 +1 @@
# TODO
@@ -1,6 +1,12 @@
"""
This file contains various utility functions.
"""
import re
from itertools import islice, chain


# Huge URL regex taken from https://gist.github.com/gruber/8891611
URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")


def replaceAll(text, replace_dict):
@@ -19,7 +25,8 @@ def replaceAll(text, replace_dict):

def clean_whitespaces(text):
    """
    Remove multiple whitespaces from text.
    Remove multiple whitespaces from text. Also removes leading and trailing \
        whitespaces.

    :param text: Text to remove multiple whitespaces from.
    :returns: A cleaned text.
@@ -35,3 +42,38 @@ def remove_duplicates(some_list):
    :returns: A list without duplicates.
    """
    return list(set(some_list))


def batch(iterable, size):
    """
    Get items from a sequence a batch at a time.

    .. note::

        Adapted from
        https://code.activestate.com/recipes/303279-getting-items-in-batches/.

    .. note::

        All batches must be exhausted immediately.

    :param iterable: An iterable to get batches from.
    :param size: Size of the batches.
    :returns: A new batch of the given size each time.
    """
    it = iter(iterable)
    while True:
        bi = islice(it, size)
        try:
            first_item = next(bi)
        except StopIteration:
            # Input is exhausted, stop yielding batches
            return
        yield chain([first_item], bi)


def remove_URLs(text):
    """
    Remove URLs from a given text (only removes http, https and naked domains \
        URLs).

    :param text: The text to remove URLs from.
    :returns: The text without URLs.
    """
    return clean_whitespaces(URL_REGEX.sub("", text))
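A tiny sketch of the batch() helper above, mainly to illustrate the "exhaust each batch immediately" caveat (the numbers are arbitrary):

from libbmc import tools

for chunk in tools.batch(range(7), 3):
    # Each chunk is a lazy iterator over at most 3 items and must be consumed
    # before the next one is requested.
    print(list(chunk))
# Prints [0, 1, 2], then [3, 4, 5], then [6]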
@@ -1,2 +1,3 @@
isbnlib==3.5.7
requests==2.9.1
arxiv2bib>=1.0.7
isbnlib>=3.5.7
requests>=2.9.1