Passing pylint on the module
commit a2ee654eac
parent a69e7ef6c1
@@ -1,15 +1,11 @@
+"""
+libbmc
+
+The :mod:`libbmc` is a generic Python library to manage bibliography and play
+with scientific papers.
+"""
+
 # Global list of valid paper identifier types. See README.md.
 __valid_identifiers__ = []

-# Import order of the modules is important, as they will populate
-# `__valid_identifiers__` on load, and the order in this list reflects their
-# priority.
-from libbmc import bibtex, doi, fetcher, isbn  # noqa
-from libbmc import citations, papers, repositories  # noqa
-
 __version__ = "0.1.3.1"
-
-__all__ = [
-    "bibtex", "doi", "fetcher", "isbn",
-    "citations", "papers", "repositories",
-]
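Note: with the module imports and `__all__` dropped from `__init__.py` above, the population of `__valid_identifiers__` relies on each identifier module appending its own name when it is imported (see the `__valid_identifiers__ += ["isbn"]` context line in the isbn hunk further down), and `libbmc.papers.identifiers` later resolves modules back from those names. A rough, self-contained sketch of that pattern (not the libbmc code itself):

    import sys

    registry = []  # plays the role of libbmc.__valid_identifiers__

    # What an identifier module does at import time,
    # cf. `__valid_identifiers__ += ["isbn"]` further down:
    registry += ["isbn"]

    # What libbmc.papers.identifiers does to resolve the module back by name:
    module = sys.modules.get("libbmc.%s" % (registry[0],), None)
    print(module)  # None in this sketch, since libbmc.isbn was never imported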
@@ -1,5 +0,0 @@
-from libbmc.citations import bbl, bibtex, pdf, plaintext
-
-__all__ = [
-    "bbl", "bibtex", "pdf", "plaintext"
-]
@@ -73,7 +73,7 @@ def get_plaintext_citations(bbl):
     return cleaned_bbl


-def get_cited_DOIs(bbl):
+def get_cited_dois(bbl):
     """
     Get the DOIs of the papers cited in a .bbl file.

@@ -85,4 +85,4 @@ def get_cited_DOIs(bbl):
     # Get the plaintext citations from the bbl file
     plaintext_citations = get_plaintext_citations(bbl)
     # Use the plaintext citations parser on these citations
-    return plaintext.get_cited_DOIs(plaintext_citations)
+    return plaintext.get_cited_dois(plaintext_citations)
@@ -2,15 +2,20 @@
 This files contains all the functions to extract DOIs of citations from
 BibTeX files.
 """
-import bibtexparser
 import os


+import bibtexparser

 from bibtexparser.bparser import BibTexParser
 from bibtexparser.customization import convert_to_unicode


 from libbmc import tools
 from libbmc.citations import plaintext

+# TODO: Use beta.dissem.in with formatted citation


 def bibentry_as_plaintext(bibentry):
     """
@@ -51,7 +56,7 @@ def get_plaintext_citations(bibtex):
     return bibentries


-def get_cited_DOIs(bibtex):
+def get_cited_dois(bibtex):
     """
     Get the DOIs of the papers cited in a BibTeX file.

@@ -71,4 +76,4 @@ def get_cited_DOIs(bibtex):
     # Get the plaintext citations from the bibtex file
     plaintext_citations = get_plaintext_citations(bibtex)
     # Use the plaintext citations parser on these citations
-    return plaintext.get_cited_DOIs(plaintext_citations)
+    return plaintext.get_cited_dois(plaintext_citations)
@@ -3,10 +3,11 @@ This files contains all the functions to extract DOIs of citations from
 PDF files.
 """
 import os
-import requests
 import subprocess
 import xml.etree.ElementTree as ET

+import requests
+
 from requests.exceptions import RequestException

 from libbmc import tools
@@ -17,7 +18,7 @@ CERMINE_BASE_URL = "http://cermine.ceon.pl/"
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


-def cermine(pdf_file, force_API=False, override_local=None):
+def cermine(pdf_file, force_api=False, override_local=None):
     """
     Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract metadata from \
         the given PDF file, to retrieve citations (and more) from the \
@@ -44,7 +45,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
         the CERMINE API terms.

     :param pdf_file: Path to the PDF file to handle.
-    :param force_API: Force the use of the Cermine API \
+    :param force_api: Force the use of the Cermine API \
         (and do not try to use a local JAR file). Defaults to ``False``.
     :param override_local: Use this specific JAR file, instead of the one at \
         the default location (``libbmc/external/cermine.jar``).
@@ -55,23 +56,23 @@ def cermine(pdf_file, force_API=False, override_local=None):
     # Check if we want to load the local JAR from a specific path
     local = override_local
     # Else, try to stat the JAR file at the expected local path
-    if (local is None) and (not force_API):
+    if (local is None) and (not force_api):
         if os.path.isfile(os.path.join(SCRIPT_DIR,
                                        "../external/cermine.jar")):
             local = os.path.join(SCRIPT_DIR,
                                  "../external/cermine.jar")

     # If we want to force the API use, or we could not get a local JAR
-    if force_API or (local is None):
+    if force_api or (local is None):
         print("Using API")
         with open(pdf_file, "rb") as fh:
             # Query the API
-            r = requests.post(
+            request = requests.post(
                 CERMINE_BASE_URL + "extract.do",
                 headers={"Content-Type": "application/binary"},
                 files={"file": fh}
             )
-            return r.text
+            return request.text
     # Else, use the local JAR file
     else:
         return subprocess.check_output([
@@ -86,7 +87,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
         return None


-def cermine_dois(pdf_file, force_API=False, override_local=None):
+def cermine_dois(pdf_file, force_api=False, override_local=None):
     """
     Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract DOIs of cited \
         papers from a PDF file.
@@ -116,7 +117,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
         try to match them on Crossref to get DOIs.

     :param pdf_file: Path to the PDF file to handle.
-    :param force_API: Force the use of the Cermine API \
+    :param force_api: Force the use of the Cermine API \
         (and do not try to use a local JAR file). Defaults to ``False``.
     :param override_local: Use this specific JAR file, instead of the one at \
         the default location (``libbmc/external/cermine.jar``).
@@ -126,7 +127,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
     # * Do not convert to plain text, but use the extra metadata from
     # CERMINE
     # Call CERMINE on the PDF file
-    cermine_output = cermine(pdf_file, force_API, override_local)
+    cermine_output = cermine(pdf_file, force_api, override_local)
     # Parse the resulting XML
     root = ET.fromstring(cermine_output)
     plaintext_references = [
@@ -136,7 +137,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
         ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
         for e in root.iter("mixed-citation")]
     # Call the plaintext methods to fetch DOIs
-    return plaintext.get_cited_DOIs(plaintext_references)
+    return plaintext.get_cited_dois(plaintext_references)


 def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
@@ -156,6 +157,8 @@ def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
     :param grobid_jar: Path to the built Grobid JAR file.
     :returns: ``True``, or ``False`` if an error occurred.
     """
+    # TODO: Should be using https://github.com/kermitt2/grobid-example and
+    # BibTeX backend.
     if grobid_home is None or grobid_jar is None:
         # User should pass the correct paths
         return False
@@ -234,4 +237,4 @@ def pdfextract_dois(pdf_file):
     root = ET.fromstring(references)
     plaintext_references = [e.text for e in root.iter("reference")]
     # Call the plaintext methods to fetch DOIs
-    return plaintext.get_cited_DOIs(plaintext_references)
+    return plaintext.get_cited_dois(plaintext_references)
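Since the keyword argument is now spelled `force_api` rather than `force_API`, callers that passed it by name need the same rename. A hedged usage sketch (the PDF path is hypothetical):

    from libbmc.citations import pdf

    # Force the CERMINE web API instead of looking for a local JAR file.
    dois = pdf.cermine_dois("paper.pdf", force_api=True)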
@@ -37,7 +37,7 @@ def get_plaintext_citations(file):
     return cleaned_citations


-def get_cited_DOIs(file):
+def get_cited_dois(file):
     """
     Get the DOIs of the papers cited in a plaintext file. The file should \
         have one citation per line.
@@ -66,29 +66,29 @@ def get_cited_DOIs(file):
     # Try to get the DOI directly from the citation
     for citation in plaintext_citations[:]:
         # Some citations already contain a DOI so try to match it directly
-        matched_DOIs = doi.extract_from_text(citation)
-        if len(matched_DOIs) > 0:
+        matched_dois = doi.extract_from_text(citation)
+        if len(matched_dois) > 0:
             # Add the DOI and go on
-            dois[citation] = next(iter(matched_DOIs))
+            dois[citation] = next(iter(matched_dois))
             continue
         # Same thing for arXiv id
-        matched_arXiv = arxiv.extract_from_text(citation)
-        if len(matched_arXiv) > 0:
+        matched_arxiv = arxiv.extract_from_text(citation)
+        if len(matched_arxiv) > 0:
             # Add the associated DOI and go on
-            dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
+            dois[citation] = arxiv.to_doi(next(iter(matched_arxiv)))
             continue
         # If no match found, stack it for next step
         # Note to remove URLs in the citation as the plaintext citations can
         # contain URLs and they are bad for the CrossRef API.
-        crossref_queue.append(tools.remove_URLs(citation))
+        crossref_queue.append(tools.remove_urls(citation))

     # Do batch with remaining papers, to prevent from the timeout of CrossRef
     for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
         batch = [i for i in batch]
         try:
             # Fetch results from CrossRef
-            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
-            for result in r.json()["results"]:
+            request = requests.post(CROSSREF_LINKS_API_URL, json=batch)
+            for result in request.json()["results"]:
                 # Try to get a DOI
                 try:
                     dois[result["text"]] = result["doi"]
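The `get_cited_DOIs` to `get_cited_dois` rename is applied consistently across the bbl, bibtex, pdf and plaintext citation modules, so external callers have to follow suit. A hedged example against the new name (file name hypothetical, assuming the function accepts a path with one citation per line as the docstring above describes):

    from libbmc.citations import plaintext

    dois = plaintext.get_cited_dois("citations.txt")
    for citation, matched_doi in dois.items():
        print(citation, "->", matched_doi)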
@@ -55,7 +55,7 @@ def is_valid(doi):
     False
     """
     match = REGEX.match(doi)
-    return ((match is not None) and (match.group(0) == doi))
+    return (match is not None) and (match.group(0) == doi)


 def extract_from_text(text):
@@ -71,17 +71,17 @@ def extract_from_text(text):
     return tools.remove_duplicates(REGEX.findall(text))


-def to_URL(dois):
+def to_url(dois):
     """
     Convert a list of canonical DOIs to a list of DOIs URLs.

     :param dois: List of canonical DOIs. Can also be a single canonical DOI.
     :returns: A list of DOIs URLs (resp. a single value).

-    >>> to_URL(['10.1209/0295-5075/111/40005'])
+    >>> to_url(['10.1209/0295-5075/111/40005'])
     ['http://dx.doi.org/10.1209/0295-5075/111/40005']

-    >>> to_URL('10.1209/0295-5075/111/40005')
+    >>> to_url('10.1209/0295-5075/111/40005')
     'http://dx.doi.org/10.1209/0295-5075/111/40005'
     """
     if isinstance(dois, list):
@@ -110,13 +110,7 @@ def to_canonical(urls):
     >>> to_canonical(['aaaa']) is None
     True
     """
-    try:
-        if isinstance(urls, list):
-            return [next(iter(extract_from_text(url))) for url in urls]
-        else:
-            return next(iter(extract_from_text(urls)))
-    except StopIteration:
-        return None
+    return tools.map_or_apply(extract_from_text, urls)


 def get_oa_version(doi):
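The try/isinstance boilerplate removed from `to_canonical` is folded into `tools.map_or_apply`, added later in this commit; the behaviour should be unchanged: a list is mapped element-wise, a single value is applied directly, and ``None`` comes back when nothing can be extracted (cf. the ``to_canonical(['aaaa']) is None`` doctest kept above). Expected behaviour, under that reading:

    from libbmc import doi

    print(doi.to_canonical("http://dx.doi.org/10.1209/0295-5075/111/40005"))
    # expected: '10.1209/0295-5075/111/40005'
    print(doi.to_canonical(['aaaa']))
    # expected: None, as in the doctest above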
@@ -134,10 +128,10 @@ def get_oa_version(doi):
     'http://arxiv.org/abs/1506.06690'
     """
     try:
-        r = requests.get("%s%s" % (DISSEMIN_API, doi))
-        r.raise_for_status()
-        result = r.json()
-        assert(result["status"] == "ok")
+        request = requests.get("%s%s" % (DISSEMIN_API, doi))
+        request.raise_for_status()
+        result = request.json()
+        assert result["status"] == "ok"
         return result["paper"]["pdf_url"]
     except (AssertionError, ValueError, KeyError, RequestException):
         return None
@@ -162,10 +156,10 @@ def get_oa_policy(doi):
     True
     """
     try:
-        r = requests.get("%s%s" % (DISSEMIN_API, doi))
-        r.raise_for_status()
-        result = r.json()
-        assert(result["status"] == "ok")
+        request = requests.get("%s%s" % (DISSEMIN_API, doi))
+        request.raise_for_status()
+        result = request.json()
+        assert result["status"] == "ok"
         return ([i
                  for i in result["paper"]["publications"]
                  if i["doi"] == doi][0])["policy"]
@@ -185,8 +179,8 @@ def get_linked_version(doi):
     'http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01'
     """
     try:
-        r = requests.head(to_URL(doi))
-        return r.headers.get("location")
+        request = requests.head(to_url(doi))
+        return request.headers.get("location")
     except RequestException:
         return None

@@ -206,10 +200,10 @@ def get_bibtex(doi):
     '@article{Verney_2015,\\n\\tdoi = {10.1209/0295-5075/111/40005},\\n\\turl = {http://dx.doi.org/10.1209/0295-5075/111/40005},\\n\\tyear = 2015,\\n\\tmonth = {aug},\\n\\tpublisher = {{IOP} Publishing},\\n\\tvolume = {111},\\n\\tnumber = {4},\\n\\tpages = {40005},\\n\\tauthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\n\\ttitle = {Hybridization of first and second sound in a weakly interacting Bose gas},\\n\\tjournal = {{EPL}}\\n}'
     """
     try:
-        r = requests.get(to_URL(doi),
+        request = requests.get(to_url(doi),
                          headers={"accept": "application/x-bibtex"})
-        r.raise_for_status()
-        assert(r.headers.get("content-type") == "application/x-bibtex")
-        return r.text
+        request.raise_for_status()
+        assert request.headers.get("content-type") == "application/x-bibtex"
+        return request.text
     except (RequestException, AssertionError):
         return None
@@ -3,24 +3,89 @@ This file contains functions to download locally some papers, eventually using
 a proxy.
 """
 import socket
-import socks
 import sys
 import urllib

+import socks
+
+
 # Default socket to use, if no proxy is used
 DEFAULT_SOCKET = socket.socket


-def download(url, proxies=[None]):
+def _download_helper(url):
+    """
+    Handle the download of an URL, using the proxy currently set in \
+        :mod:`socks`.
+
+    :param url: The URL to download.
+    :returns: A tuple of the raw content of the downloaded data and its \
+        associated content-type. Returns None if it was \
+        unable to download the document.
+    """
+    # Try to fetch the URL using the current proxy
+    try:
+        request = urllib.request.urlopen(url)
+        try:
+            size = int(dict(request.info())['content-length'].strip())
+        except KeyError:
+            try:
+                size = int(dict(request.info())['Content-Length'].strip())
+            except KeyError:
+                size = 0
+        # Download the document
+        doc = b""
+        doc_size = 0
+        while True:
+            buf = request.read(1024)
+            if buf:
+                doc += buf
+                doc_size += len(buf)
+                if size != 0:
+                    # Write progress bar on stdout
+                    done = int(50 * doc_size / size)
+                    sys.stdout.write("\r[%s%s]" %
+                                     ('='*done, ' '*(50-done)))
+                    sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
+                    sys.stdout.flush()
+            else:
+                break
+        # Fetch content type
+        contenttype = None
+        contenttype_req = None
+        try:
+            contenttype_req = dict(request.info())['content-type']
+        except KeyError:
+            try:
+                contenttype_req = dict(request.info())['Content-Type']
+            except KeyError:
+                return None
+        if 'pdf' in contenttype_req:
+            contenttype = 'pdf'
+        elif 'djvu' in contenttype_req:
+            contenttype = 'djvu'
+
+        # Check content type and status code are ok
+        if request.getcode() != 200 or contenttype is None:
+            # Else, try with the next available proxy
+            return None
+
+        # Return a tuple of the downloaded content and the content-type
+        return (doc, contenttype)
+    # If an exception occurred, continue with next available proxy
+    except (urllib.error.URLError, socket.error, ValueError):
+        return None
+
+
+def download(url, proxies=None):
     """
     Download a PDF or DJVU document from a url, eventually using proxies.

     :params url: The URL to the PDF/DJVU document to fetch.
     :params proxies: An optional list of proxies to use. Proxies will be \
         used sequentially. Proxies should be a list of proxy strings. \
-        Do not forget to include ``None`` in the list if you want to try \
-        direct fetching without any proxy.
+        Do not forget to include ``""`` (empty string) in the list if \
+        you want to try direct fetching without any proxy.

     :returns: A tuple of the raw content of the downloaded data and its \
         associated content-type. Returns ``(None, None)`` if it was \
@@ -28,10 +93,14 @@ def download(url, proxies=[None]):

     >>> download("http://arxiv.org/pdf/1312.4006.pdf")  # doctest: +SKIP
     """
+    # Handle default argument
+    if proxies is None:
+        proxies = [""]
+
     # Loop over all available connections
     for proxy in proxies:
         # Handle no proxy case
-        if proxy is None:
+        if proxy == "":
             socket.socket = DEFAULT_SOCKET
         # Handle SOCKS proxy
         elif proxy.startswith('socks'):
@@ -55,58 +124,9 @@ def download(url, proxies=[None]):
             socks.set_default_proxy(socks.HTTP, proxy, port)
             socket.socket = socks.socksocket

-        # Try to fetch the URL using the current proxy
-        try:
-            r = urllib.request.urlopen(url)
-            try:
-                size = int(dict(r.info())['content-length'].strip())
-            except KeyError:
-                try:
-                    size = int(dict(r.info())['Content-Length'].strip())
-                except KeyError:
-                    size = 0
-            # Download the document
-            dl = b""
-            dl_size = 0
-            while True:
-                buf = r.read(1024)
-                if buf:
-                    dl += buf
-                    dl_size += len(buf)
-                    if size != 0:
-                        # Write progress bar on stdout
-                        done = int(50 * dl_size / size)
-                        sys.stdout.write("\r[%s%s]" %
-                                         ('='*done, ' '*(50-done)))
-                        sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
-                        sys.stdout.flush()
-                else:
-                    break
-            # Fetch content type
-            contenttype = False
-            contenttype_req = None
-            try:
-                contenttype_req = dict(r.info())['content-type']
-            except KeyError:
-                try:
-                    contenttype_req = dict(r.info())['Content-Type']
-                except KeyError:
-                    continue
-            if 'pdf' in contenttype_req:
-                contenttype = 'pdf'
-            elif 'djvu' in contenttype_req:
-                contenttype = 'djvu'
-
-            # Check content type and status code are ok
-            if r.getcode() != 200 or contenttype is False:
-                # Else, try with the next available proxy
-                continue
-
-            # Return a tuple of the downloaded content and the content-type
-            return (dl, contenttype)
-        # If an exception occurred, continue with next available proxy
-        except (urllib.error.URLError, socket.error, ValueError):
-            continue
+        downloaded = _download_helper(url)
+        if downloaded is not None:
+            return downloaded

     # In case of running out of proxies, return (None, None)
     return (None, None)
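After this refactoring the per-URL fetch lives in `_download_helper` and `download` only cycles through the proxy list; note that the "no proxy" marker changed from ``None`` to the empty string. A hedged usage sketch (the URL is the one from the docstring, the SOCKS address and its exact string format are hypothetical):

    from libbmc import fetcher

    # Try a direct connection first (""), then fall back to a SOCKS proxy.
    data, content_type = fetcher.download("http://arxiv.org/pdf/1312.4006.pdf",
                                          proxies=["", "socks5://127.0.0.1:1080"])
    if data is not None:
        print(content_type, len(data))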
@@ -11,11 +11,11 @@ from libbmc import __valid_identifiers__
 __valid_identifiers__ += ["isbn"]


-def is_valid(isbn):
+def is_valid(isbn_id):
     """
     Check that a given string is a valid ISBN.

-    :param isbn: the isbn to be checked.
+    :param isbn_id: the isbn to be checked.
     :returns: boolean indicating whether the isbn is valid or not.

     >>> is_valid("978-3-16-148410-0")
@@ -43,9 +43,9 @@ def is_valid(isbn):
     True
     """
     return (
-        (not isbnlib.notisbn(isbn)) and (
-            isbnlib.get_canonical_isbn(isbn) == isbn or
-            isbnlib.mask(isbnlib.get_canonical_isbn(isbn)) == isbn)
+        (not isbnlib.notisbn(isbn_id)) and (
+            isbnlib.get_canonical_isbn(isbn_id) == isbn_id or
+            isbnlib.mask(isbnlib.get_canonical_isbn(isbn_id)) == isbn_id)
     )


@@ -64,28 +64,28 @@ def extract_from_text(text):
     return [i for i in isbns if i is not None]


-def get_bibtex(isbn):
+def get_bibtex(isbn_identifier):
     """
     Get a BibTeX string for the given ISBN.

-    :param isbn: ISBN to fetch BibTeX entry for.
+    :param isbn_identifier: ISBN to fetch BibTeX entry for.
     :returns: A BibTeX string or ``None`` if could not fetch it.

     >>> get_bibtex('9783161484100')
     '@book{9783161484100,\\n title = {Berkeley, Oakland: Albany, Emeryville, Alameda, Kensington},\\n author = {Peekaboo Maps},\\n isbn = {9783161484100},\\n year = {2009},\\n publisher = {Peek A Boo Maps}\\n}'
     """
     # Try to find the BibTeX using associated DOIs
-    bibtex = doi.get_bibtex(to_DOI(isbn))
+    bibtex = doi.get_bibtex(to_doi(isbn_identifier))
     if bibtex is None:
         # In some cases, there are no DOIs for a given ISBN. In this case, try
         # to fetch bibtex directly from the ISBN, using a combination of
         # Google Books and worldcat.org results.
         bibtex = isbnlib.registry.bibformatters['bibtex'](
-            isbnlib.meta(isbn, 'default'))
+            isbnlib.meta(isbn_identifier, 'default'))
     return bibtex


-def to_DOI(isbn):
+def to_doi(isbn_identifier):
     """
     Make a DOI out of the given ISBN.

@@ -94,16 +94,16 @@ def to_DOI(isbn):
     See https://github.com/xlcnd/isbnlib#note. The returned DOI may not be
     issued yet.

-    :param isbn: A valid ISBN string.
+    :param isbn_identifier: A valid ISBN string.
     :returns: A DOI as string.

-    >>> to_DOI('9783161484100')
+    >>> to_doi('9783161484100')
     '10.978.316/1484100'
     """
-    return isbnlib.doi(isbn)
+    return isbnlib.doi(isbn_identifier)


-def from_DOI(doi):
+def from_doi(doi_identifier):
     """
     Make an ISBN out of the given DOI.

@@ -119,10 +119,10 @@ def from_DOI(doi):
     issued yet (it is a valid one, but not necessary corresponding to a
     valid book).

-    :param doi: A valid canonical DOI.
+    :param doi_identifier: A valid canonical DOI.
     :returns: An ISBN string.

-    >>> from_DOI('10.978.316/1484100')
+    >>> from_doi('10.978.316/1484100')
     '9783161484100'
     """
-    return "".join(c for c in doi[2:] if c in "0123456789xX")
+    return "".join(c for c in doi_identifier[2:] if c in "0123456789xX")
@@ -1,6 +0,0 @@
-from libbmc.papers import identifiers
-
-__all__ = [
-    "identifiers",
-    "tearpages"
-]
@@ -13,8 +13,8 @@ import sys
 from libbmc import __valid_identifiers__

 # Import all the modules associated to __valid_identifiers__
-for type in __valid_identifiers__:
-    importlib.import_module("libbmc.%s" % (type,))
+for valid_identifier in __valid_identifiers__:
+    importlib.import_module("libbmc.%s" % (valid_identifier,))


 def find_identifiers(src):
@@ -53,18 +53,19 @@ def find_identifiers(src):

     while totext.poll() is None:
         extract_full = ' '.join([i.decode("utf-8").strip()
                                  for i in totext.stdout.readlines()])
         # Loop over all the valid identifier types
-        for type in __valid_identifiers__:
+        for identifier in __valid_identifiers__:
             # Dynamically call the ``extract_from_text`` method for the
             # associated module.
-            m = sys.modules.get("libbmc.%s" % (type,), None)
-            if m is None:
+            module = sys.modules.get("libbmc.%s" % (identifier,), None)
+            if module is None:
                 continue
-            found_id = getattr(m, "extract_from_text")(extract_full)
+            found_id = getattr(module, "extract_from_text")(extract_full)
             if found_id:
                 totext.terminate()
-                return (type, found_id[0])  # found_id is a list of found IDs
+                # found_id is a list of found IDs
+                return (identifier, found_id[0])
     return (None, None)


@@ -80,12 +81,12 @@ def get_bibtex(identifier):
     :returns: A BibTeX string or ``None`` if an error occurred.
     # TODO: Should return a BiBTeX object?
     """
-    type, id = identifier
-    if type not in __valid_identifiers__:
+    identifier_type, identifier_id = identifier
+    if identifier_type not in __valid_identifiers__:
         return None

     # Dynamically call the ``get_bibtex`` method from the associated module.
-    m = sys.modules.get("libbmc.%s" % (type,), None)
-    if m is None:
+    module = sys.modules.get("libbmc.%s" % (identifier_type,), None)
+    if module is None:
         return None
-    return getattr(m, "get_bibtex")(id)
+    return getattr(module, "get_bibtex")(identifier_id)
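`find_identifiers` and `get_bibtex` keep exchanging identifiers as a ``(type, id)`` tuple; only the local variable names were expanded for pylint. A hedged example of the round trip (PDF path hypothetical):

    from libbmc.papers import identifiers

    identifier = identifiers.find_identifiers("paper.pdf")
    # e.g. ("doi", "10.1209/0295-5075/111/40005"), or (None, None) if nothing was found
    if identifier != (None, None):
        bibtex = identifiers.get_bibtex(identifier)  # dispatched to libbmc.<type>.get_bibtex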
@@ -21,7 +21,7 @@ BAD_JOURNALS = {
 }


-def fixPdf(pdfFile, destination):
+def fix_pdf(pdf_file, destination):
     """
     Fix malformed pdf files when data are present after '%%EOF'

@@ -33,18 +33,16 @@ def fixPdf(pdfFile, destination):
     :param destination: destination
     """
     tmp = tempfile.NamedTemporaryFile()
-    output = open(tmp.name, 'wb')
-    with open(pdfFile, "rb") as fh:
-        with open(pdfFile, "rb") as fh:
+    with open(tmp.name, 'wb') as output:
+        with open(pdf_file, "rb") as fh:
             for line in fh:
                 output.write(line)
                 if b'%%EOF' in line:
                     break
-    output.close()
     shutil.copy(tmp.name, destination)


-def tearpage_backend(filename, teared_pages=[0]):
+def tearpage_backend(filename, teared_pages=None):
     """
     Copy filename to a tempfile, write pages to filename except the teared one.

@@ -56,29 +54,35 @@ def tearpage_backend(filename, teared_pages=[0]):
     :param teared_pages: Numbers of the pages to tear. Default to first page \
         only.
     """
+    # Handle default argument
+    if teared_pages is None:
+        teared_pages = [0]
+
     # Copy the pdf to a tmp file
-    tmp = tempfile.NamedTemporaryFile()
-    shutil.copy(filename, tmp.name)
-    # Read the copied pdf
-    try:
-        input_file = PdfFileReader(open(tmp.name, 'rb'))
-    except PdfReadError:
-        fixPdf(filename, tmp.name)
-        input_file = PdfFileReader(open(tmp.name, 'rb'))
-    # Seek for the number of pages
-    num_pages = input_file.getNumPages()
+    with tempfile.NamedTemporaryFile() as tmp:
+        # Copy the input file to tmp
+        shutil.copy(filename, tmp.name)
+
+        # Read the copied pdf
+        # TODO: Use with syntax
+        try:
+            input_file = PdfFileReader(open(tmp.name, 'rb'))
+        except PdfReadError:
+            fix_pdf(filename, tmp.name)
+            input_file = PdfFileReader(open(tmp.name, 'rb'))
+        # Seek for the number of pages
+        num_pages = input_file.getNumPages()

     # Write pages excepted the first one
     output_file = PdfFileWriter()
     for i in range(num_pages):
         if i in teared_pages:
             continue
         output_file.addPage(input_file.getPage(i))

     tmp.close()
     outputStream = open(filename, "wb")
     output_file.write(outputStream)


 def tearpage_needed(bibtex):
@@ -89,16 +93,16 @@ def tearpage_needed(bibtex):
     whether tearing is needed.
     :returns: A list of pages to tear.
     """
-    for p in BAD_JOURNALS:
-        if p in bibtex.get("journal", "").lower():
+    for publisher in BAD_JOURNALS:
+        if publisher in bibtex.get("journal", "").lower():
             # Bad journal is found, add pages to tear
-            return BAD_JOURNALS[p]
+            return BAD_JOURNALS[publisher]

     # If no bad journals are found, return an empty list
     return []


-def tearpage(filename, bibtex=None, force=False):
+def tearpage(filename, bibtex=None, force=None):
     """
     Tear some pages of the file if needed.

@@ -112,7 +116,7 @@ def tearpage(filename, bibtex=None, force=False):
     """
     # Fetch pages to tear
     pages_to_tear = []
-    if force is not False:
+    if force is not None:
         pages_to_tear = force
     elif bibtex is not None:
         pages_to_tear = tearpage_needed(bibtex)
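The ``teared_pages=[0]`` to ``teared_pages=None`` change is the usual fix for pylint's dangerous-default-value warning: a mutable default is created once and shared across calls, so the real default is now resolved inside the function. A minimal illustration of the pattern, independent of libbmc:

    def tear(pages=None):
        # Resolve the default inside the call, as tearpage_backend now does.
        if pages is None:
            pages = [0]
        pages.append(1)  # mutating the local list no longer leaks into later calls
        return pages

    print(tear())  # [0, 1]
    print(tear())  # [0, 1] again; with a mutable default this list would keep growing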
@@ -1,5 +0,0 @@
-from libbmc.repositories import arxiv, hal
-
-__all__ = [
-    "arxiv", "hal"
-]
@@ -1,15 +1,18 @@
 """
 This file contains all the arXiv-related functions.
 """
-import arxiv2bib
-import bibtexparser
 import io
 import re
-import requests
 import tarfile
 import xml.etree.ElementTree

 from urllib.error import HTTPError


+import arxiv2bib
+import bibtexparser
+import requests
+
 from requests.exceptions import RequestException


@@ -268,7 +271,7 @@ def is_valid(arxiv_id):
     False
     """
     match = REGEX.match(arxiv_id)
-    return ((match is not None) and (match.group(0) == arxiv_id))
+    return (match is not None) and (match.group(0) == arxiv_id)


 def get_bibtex(arxiv_id):
@@ -320,17 +323,17 @@ def extract_from_text(text):
                                     for i in REGEX.findall(text) if i[0] != ''])


-def to_URL(arxiv_ids):
+def to_url(arxiv_ids):
     """
     Convert a list of canonical DOIs to a list of DOIs URLs.

     :param dois: List of canonical DOIs.
     :returns: A list of DOIs URLs.

-    >>> to_URL('1506.06690')
+    >>> to_url('1506.06690')
     'http://arxiv.org/abs/1506.06690'

-    >>> to_URL('1506.06690v1')
+    >>> to_url('1506.06690v1')
     'http://arxiv.org/abs/1506.06690v1'
     """
     if isinstance(arxiv_ids, list):
@@ -358,16 +361,10 @@ def to_canonical(urls):
     >>> to_canonical('aaa') is None
     True
     """
-    try:
-        if isinstance(urls, list):
-            return [next(iter(extract_from_text(url))) for url in urls]
-        else:
-            return next(iter(extract_from_text(urls)))
-    except StopIteration:
-        return None
+    return tools.map_or_apply(extract_from_text, urls)


-def from_DOI(doi):
+def from_doi(doi):
     """
     Get the arXiv eprint id for a given DOI.

@@ -379,29 +376,29 @@ def from_DOI(doi):
     :param doi: The DOI of the resource to look for.
     :returns: The arXiv eprint id, or ``None`` if not found.

-    >>> from_DOI('10.1209/0295-5075/111/40005')
+    >>> from_doi('10.1209/0295-5075/111/40005')
     # Note: Test do not pass due to an arXiv API bug.
     '1506.06690'
     """
     try:
-        r = requests.get("http://export.arxiv.org/api/query",
+        request = requests.get("http://export.arxiv.org/api/query",
                          params={
                              "search_query": "doi:%s" % (doi,),
                              "max_results": 1
                          })
-        r.raise_for_status()
+        request.raise_for_status()
     except RequestException:
         return None
-    e = xml.etree.ElementTree.fromstring(r.content)
-    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
-        id = entry.find("{http://www.w3.org/2005/Atom}id").text
-        # id is an arXiv full URL. We only want the id which is the last URL
-        # component.
-        return id.split("/")[-1]
+    root = xml.etree.ElementTree.fromstring(request.content)
+    for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
+        arxiv_id = entry.find("{http://www.w3.org/2005/Atom}id").text
+        # arxiv_id is an arXiv full URL. We only want the id which is the last
+        # URL component.
+        return arxiv_id.split("/")[-1]
     return None


-def to_DOI(arxiv_id):
+def to_doi(arxiv_id):
     """
     Get the associated DOI for a given arXiv eprint.

@@ -413,23 +410,23 @@ def to_DOI(arxiv_id):
     :param eprint: The arXiv eprint id.
     :returns: The DOI if any, or ``None``.

-    >>> to_DOI('1506.06690v1')
+    >>> to_doi('1506.06690v1')
     '10.1209/0295-5075/111/40005'

-    >>> to_DOI('1506.06690')
+    >>> to_doi('1506.06690')
     '10.1209/0295-5075/111/40005'
     """
     try:
-        r = requests.get("http://export.arxiv.org/api/query",
+        request = requests.get("http://export.arxiv.org/api/query",
                          params={
                              "id_list": arxiv_id,
                              "max_results": 1
                          })
-        r.raise_for_status()
+        request.raise_for_status()
     except RequestException:
         return None
-    e = xml.etree.ElementTree.fromstring(r.content)
-    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
+    root = xml.etree.ElementTree.fromstring(request.content)
+    for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
         doi = entry.find("{http://arxiv.org/schemas/atom}doi")
         if doi is not None:
             return doi.text
@@ -451,9 +448,9 @@ def get_sources(arxiv_id):
         ``None``.
     """
     try:
-        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
-        r.raise_for_status()
-        file_object = io.BytesIO(r.content)
+        request = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
+        request.raise_for_status()
+        file_object = io.BytesIO(request.content)
         return tarfile.open(fileobj=file_object)
     except (RequestException, AssertionError, tarfile.TarError):
         return None
@@ -473,9 +470,9 @@ def get_bbl(arxiv_id):
     :returns: A list of the full text of the ``.bbl`` files (if any) \
         or ``None``.
     """
-    tf = get_sources(arxiv_id)
-    bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
-    bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
+    tar_file = get_sources(arxiv_id)
+    bbl_files = [i for i in tar_file.getmembers() if i.name.endswith(".bbl")]
+    bbl_files = [tar_file.extractfile(member).read().decode(tarfile.ENCODING)
                  for member in bbl_files]
     return bbl_files

@@ -498,5 +495,5 @@ def get_citations(arxiv_id):
     bbl_files = get_bbl(arxiv_id)
     for bbl_file in bbl_files:
         # Fetch the cited DOIs for each of the bbl files
-        dois.update(bbl.get_cited_DOIs(bbl_file))
+        dois.update(bbl.get_cited_dois(bbl_file))
     return dois
|
@ -33,7 +33,7 @@ def is_valid(hal_id):
|
|||||||
False
|
False
|
||||||
"""
|
"""
|
||||||
match = REGEX.match(hal_id)
|
match = REGEX.match(hal_id)
|
||||||
return ((match is not None) and (match.group(0) == hal_id))
|
return (match is not None) and (match.group(0) == hal_id)
|
||||||
|
|
||||||
|
|
||||||
def extract_from_text(text):
|
def extract_from_text(text):
|
||||||
|
@@ -9,9 +9,11 @@ from itertools import islice, chain

 # Huge URL regex taken from https://gist.github.com/gruber/8891611
URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")
+_SLUGIFY_STRIP_RE = re.compile(r'[^\w\s-]')
+_SLUGIFY_HYPHENATE_RE = re.compile(r'[\s]+')


-def replaceAll(text, replace_dict):
+def replace_all(text, replace_dict):
     """
     Replace multiple strings in a text.

@@ -26,7 +28,7 @@ def replaceAll(text, replace_dict):
         substitution.
     :returns: Text after replacements.

-    >>> replaceAll("foo bar foo thing", {"foo": "oof", "bar": "rab"})
+    >>> replace_all("foo bar foo thing", {"foo": "oof", "bar": "rab"})
     'oof rab oof thing'
     """
     for i, j in replace_dict.items():
@@ -34,6 +36,24 @@ def replaceAll(text, replace_dict):
     return text


+def map_or_apply(function, param):
+    """
+    Map the function on ``param``, or apply it, depending whether ``param`` \
+        is a list or an item.
+
+    :param function: The function to apply.
+    :param param: The parameter to feed the function with (list or item).
+    :returns: The computed value or ``None``.
+    """
+    try:
+        if isinstance(param, list):
+            return [next(iter(function(i))) for i in param]
+        else:
+            return next(iter(function(param)))
+    except StopIteration:
+        return None
+
+
 def clean_whitespaces(text):
     """
     Remove multiple whitespaces from text. Also removes leading and trailing \
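`map_or_apply` generalises the list-or-single-item handling that `doi.to_canonical` and `arxiv.to_canonical` previously duplicated. A small self-contained sketch of how it behaves, using a hypothetical stand-in extractor:

    def map_or_apply(function, param):
        # Copy of the helper added above, repeated here only for illustration.
        try:
            if isinstance(param, list):
                return [next(iter(function(i))) for i in param]
            else:
                return next(iter(function(param)))
        except StopIteration:
            return None


    def fake_extract(text):
        # Hypothetical stand-in for doi.extract_from_text / arxiv.extract_from_text.
        return [word for word in text.split() if word.startswith("10.")]


    print(map_or_apply(fake_extract, "see 10.1209/0295-5075/111/40005"))   # single item -> one DOI
    print(map_or_apply(fake_extract, ["a 10.1000/1 b", "c 10.1000/2 d"]))  # list -> list of DOIs
    print(map_or_apply(fake_extract, "nothing to find"))                   # no match -> None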
@@ -85,13 +105,13 @@ def batch(iterable, size):
     >>> [list(i) for i in batch([1, 2, 3, 4, 5], 2)]
     [[1, 2], [3, 4], [5]]
     """
-    it = iter(iterable)
+    item = iter(iterable)
     while True:
-        bi = islice(it, size)
-        yield chain([next(bi)], bi)
+        batch_iterator = islice(item, size)
+        yield chain([next(batch_iterator)], batch_iterator)


-def remove_URLs(text):
+def remove_urls(text):
     """
     Remove URLs from a given text (only removes http, https and naked domains \
         URLs).
@@ -99,16 +119,12 @@ def remove_URLs(text):
     :param text: The text to remove URLs from.
     :returns: The text without URLs.

-    >>> remove_URLs("foobar http://example.com https://example.com foobar")
+    >>> remove_urls("foobar http://example.com https://example.com foobar")
     'foobar foobar'
     """
     return clean_whitespaces(URL_REGEX.sub("", text))


-_slugify_strip_re = re.compile(r'[^\w\s-]')
-_slugify_hyphenate_re = re.compile(r'[\s]+')
-
-
 def slugify(value):
     """
     Normalizes string, converts to lowercase, removes non-alpha characters,
@@ -127,5 +143,5 @@ def slugify(value):
     value = unicode_type(value)
     value = (unicodedata.normalize('NFKD', value).
              encode('ascii', 'ignore').decode('ascii'))
-    value = unicode_type(_slugify_strip_re.sub('', value).strip())
-    return _slugify_hyphenate_re.sub('_', value)
+    value = unicode_type(_SLUGIFY_STRIP_RE.sub('', value).strip())
+    return _SLUGIFY_HYPHENATE_RE.sub('_', value)