Passing pylint on the module

Lucas Verney 2016-02-17 16:04:36 +01:00
parent a69e7ef6c1
commit a2ee654eac
16 changed files with 282 additions and 262 deletions


@ -1,15 +1,11 @@
"""
libbmc
The :mod:`libbmc` module is a generic Python library to manage bibliographies and
play with scientific papers.
"""
# Global list of valid paper identifier types. See README.md.
__valid_identifiers__ = []
# Import order of the modules is important, as they will populate
# `__valid_identifiers__` on load, and the order in this list reflects their
# priority.
from libbmc import bibtex, doi, fetcher, isbn # noqa
from libbmc import citations, papers, repositories # noqa
__version__ = "0.1.3.1"
__all__ = [
"bibtex", "doi", "fetcher", "isbn",
"citations", "papers", "repositories",
]
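For context on the comment about import order: each submodule registers its identifier type by appending to ``__valid_identifiers__`` when it is imported, as the isbn module does later in this commit. A minimal sketch of that pattern:

# Inside a submodule, e.g. libbmc/isbn.py (shown further down in this diff):
from libbmc import __valid_identifiers__

# Register this identifier type when the module is imported.
__valid_identifiers__ += ["isbn"]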


@ -1,5 +0,0 @@
from libbmc.citations import bbl, bibtex, pdf, plaintext
__all__ = [
"bbl", "bibtex", "pdf", "plaintext"
]


@ -73,7 +73,7 @@ def get_plaintext_citations(bbl):
return cleaned_bbl
def get_cited_DOIs(bbl):
def get_cited_dois(bbl):
"""
Get the DOIs of the papers cited in a .bbl file.
@ -85,4 +85,4 @@ def get_cited_DOIs(bbl):
# Get the plaintext citations from the bbl file
plaintext_citations = get_plaintext_citations(bbl)
# Use the plaintext citations parser on these citations
return plaintext.get_cited_DOIs(plaintext_citations)
return plaintext.get_cited_dois(plaintext_citations)


@ -2,15 +2,20 @@
This file contains all the functions to extract DOIs of citations from
BibTeX files.
"""
import bibtexparser
import os
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode
from libbmc import tools
from libbmc.citations import plaintext
# TODO: Use beta.dissem.in with formatted citation
def bibentry_as_plaintext(bibentry):
"""
@ -51,7 +56,7 @@ def get_plaintext_citations(bibtex):
return bibentries
def get_cited_DOIs(bibtex):
def get_cited_dois(bibtex):
"""
Get the DOIs of the papers cited in a BibTeX file.
@ -71,4 +76,4 @@ def get_cited_DOIs(bibtex):
# Get the plaintext citations from the bibtex file
plaintext_citations = get_plaintext_citations(bibtex)
# Use the plaintext citations parser on these citations
return plaintext.get_cited_DOIs(plaintext_citations)
return plaintext.get_cited_dois(plaintext_citations)


@ -3,10 +3,11 @@ This file contains all the functions to extract DOIs of citations from
PDF files.
"""
import os
import requests
import subprocess
import xml.etree.ElementTree as ET
import requests
from requests.exceptions import RequestException
from libbmc import tools
@ -17,7 +18,7 @@ CERMINE_BASE_URL = "http://cermine.ceon.pl/"
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
def cermine(pdf_file, force_API=False, override_local=None):
def cermine(pdf_file, force_api=False, override_local=None):
"""
Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract metadata from \
the given PDF file, to retrieve citations (and more) from the \
@ -44,7 +45,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
the CERMINE API terms.
:param pdf_file: Path to the PDF file to handle.
:param force_API: Force the use of the Cermine API \
:param force_api: Force the use of the Cermine API \
(and do not try to use a local JAR file). Defaults to ``False``.
:param override_local: Use this specific JAR file, instead of the one at \
the default location (``libbmc/external/cermine.jar``).
@ -55,23 +56,23 @@ def cermine(pdf_file, force_API=False, override_local=None):
# Check if we want to load the local JAR from a specific path
local = override_local
# Else, try to stat the JAR file at the expected local path
if (local is None) and (not force_API):
if (local is None) and (not force_api):
if os.path.isfile(os.path.join(SCRIPT_DIR,
"../external/cermine.jar")):
local = os.path.join(SCRIPT_DIR,
"../external/cermine.jar")
# If we want to force the API use, or we could not get a local JAR
if force_API or (local is None):
if force_api or (local is None):
print("Using API")
with open(pdf_file, "rb") as fh:
# Query the API
r = requests.post(
CERMINE_BASE_URL + "extract.do",
headers={"Content-Type": "application/binary"},
files={"file": fh}
)
return r.text
# Query the API
request = requests.post(
CERMINE_BASE_URL + "extract.do",
headers={"Content-Type": "application/binary"},
files={"file": fh}
)
return request.text
# Else, use the local JAR file
else:
return subprocess.check_output([
@ -86,7 +87,7 @@ def cermine(pdf_file, force_API=False, override_local=None):
return None
def cermine_dois(pdf_file, force_API=False, override_local=None):
def cermine_dois(pdf_file, force_api=False, override_local=None):
"""
Run `CERMINE <https://github.com/CeON/CERMINE>`_ to extract DOIs of cited \
papers from a PDF file.
@ -116,7 +117,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
try to match them on Crossref to get DOIs.
:param pdf_file: Path to the PDF file to handle.
:param force_API: Force the use of the Cermine API \
:param force_api: Force the use of the Cermine API \
(and do not try to use a local JAR file). Defaults to ``False``.
:param override_local: Use this specific JAR file, instead of the one at \
the default location (``libbmc/external/cermine.jar``).
@ -126,7 +127,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
# * Do not convert to plain text, but use the extra metadata from
# CERMINE
# Call CERMINE on the PDF file
cermine_output = cermine(pdf_file, force_API, override_local)
cermine_output = cermine(pdf_file, force_api, override_local)
# Parse the resulting XML
root = ET.fromstring(cermine_output)
plaintext_references = [
@ -136,7 +137,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None):
ET.tostring(e, method="text").decode("utf-8").replace(e.text, ""))
for e in root.iter("mixed-citation")]
# Call the plaintext methods to fetch DOIs
return plaintext.get_cited_DOIs(plaintext_references)
return plaintext.get_cited_dois(plaintext_references)
def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
@ -156,6 +157,8 @@ def grobid(pdf_folder, grobid_home=None, grobid_jar=None):
:param grobid_jar: Path to the built Grobid JAR file.
:returns: ``True``, or ``False`` if an error occurred.
"""
# TODO: Should be using https://github.com/kermitt2/grobid-example and
# BibTeX backend.
if grobid_home is None or grobid_jar is None:
# User should pass the correct paths
return False
@ -234,4 +237,4 @@ def pdfextract_dois(pdf_file):
root = ET.fromstring(references)
plaintext_references = [e.text for e in root.iter("reference")]
# Call the plaintext methods to fetch DOIs
return plaintext.get_cited_DOIs(plaintext_references)
return plaintext.get_cited_dois(plaintext_references)
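A brief usage sketch of the renamed CERMINE helpers; the file name is hypothetical, and either a local ``libbmc/external/cermine.jar`` or network access to the CERMINE API is assumed:

from libbmc.citations import pdf

# Prefer the local CERMINE JAR when present, falling back to the web API.
dois = pdf.cermine_dois("paper.pdf")

# Force the remote API, ignoring any local JAR.
dois = pdf.cermine_dois("paper.pdf", force_api=True)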


@ -37,7 +37,7 @@ def get_plaintext_citations(file):
return cleaned_citations
def get_cited_DOIs(file):
def get_cited_dois(file):
"""
Get the DOIs of the papers cited in a plaintext file. The file should \
have one citation per line.
@ -66,29 +66,29 @@ def get_cited_DOIs(file):
# Try to get the DOI directly from the citation
for citation in plaintext_citations[:]:
# Some citations already contain a DOI so try to match it directly
matched_DOIs = doi.extract_from_text(citation)
if len(matched_DOIs) > 0:
matched_dois = doi.extract_from_text(citation)
if len(matched_dois) > 0:
# Add the DOI and go on
dois[citation] = next(iter(matched_DOIs))
dois[citation] = next(iter(matched_dois))
continue
# Same thing for arXiv id
matched_arXiv = arxiv.extract_from_text(citation)
if len(matched_arXiv) > 0:
matched_arxiv = arxiv.extract_from_text(citation)
if len(matched_arxiv) > 0:
# Add the associated DOI and go on
dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv)))
dois[citation] = arxiv.to_doi(next(iter(matched_arxiv)))
continue
# If no match found, stack it for next step
# Note: remove URLs from the citation, as plaintext citations can
# contain URLs and they confuse the CrossRef API.
crossref_queue.append(tools.remove_URLs(citation))
crossref_queue.append(tools.remove_urls(citation))
# Process the remaining papers in batches, to avoid CrossRef timeouts
for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
batch = [i for i in batch]
try:
# Fetch results from CrossRef
r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
for result in r.json()["results"]:
request = requests.post(CROSSREF_LINKS_API_URL, json=batch)
for result in request.json()["results"]:
# Try to get a DOI
try:
dois[result["text"]] = result["doi"]
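A hedged usage sketch of the renamed ``get_cited_dois``; the citation string is made up, and a list of citations is assumed to be accepted (as the calls from the bbl and pdf modules above suggest). Network access to CrossRef may be needed for citations without an embedded DOI or arXiv id:

from libbmc.citations import plaintext

citations = ["L. Verney et al., EPL 111, 40005 (2015)."]  # hypothetical citation
dois = plaintext.get_cited_dois(citations)
# ``dois`` maps each citation string to the DOI that was found for it.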


@ -55,7 +55,7 @@ def is_valid(doi):
False
"""
match = REGEX.match(doi)
return ((match is not None) and (match.group(0) == doi))
return (match is not None) and (match.group(0) == doi)
def extract_from_text(text):
@ -71,17 +71,17 @@ def extract_from_text(text):
return tools.remove_duplicates(REGEX.findall(text))
def to_URL(dois):
def to_url(dois):
"""
Convert a list of canonical DOIs to a list of DOI URLs.
:param dois: List of canonical DOIs. Can also be a single canonical DOI.
:returns: A list of DOI URLs (resp. a single value).
>>> to_URL(['10.1209/0295-5075/111/40005'])
>>> to_url(['10.1209/0295-5075/111/40005'])
['http://dx.doi.org/10.1209/0295-5075/111/40005']
>>> to_URL('10.1209/0295-5075/111/40005')
>>> to_url('10.1209/0295-5075/111/40005')
'http://dx.doi.org/10.1209/0295-5075/111/40005'
"""
if isinstance(dois, list):
@ -110,13 +110,7 @@ def to_canonical(urls):
>>> to_canonical(['aaaa']) is None
True
"""
try:
if isinstance(urls, list):
return [next(iter(extract_from_text(url))) for url in urls]
else:
return next(iter(extract_from_text(urls)))
except StopIteration:
return None
return tools.map_or_apply(extract_from_text, urls)
def get_oa_version(doi):
@ -134,10 +128,10 @@ def get_oa_version(doi):
'http://arxiv.org/abs/1506.06690'
"""
try:
r = requests.get("%s%s" % (DISSEMIN_API, doi))
r.raise_for_status()
result = r.json()
assert(result["status"] == "ok")
request = requests.get("%s%s" % (DISSEMIN_API, doi))
request.raise_for_status()
result = request.json()
assert result["status"] == "ok"
return result["paper"]["pdf_url"]
except (AssertionError, ValueError, KeyError, RequestException):
return None
@ -162,10 +156,10 @@ def get_oa_policy(doi):
True
"""
try:
r = requests.get("%s%s" % (DISSEMIN_API, doi))
r.raise_for_status()
result = r.json()
assert(result["status"] == "ok")
request = requests.get("%s%s" % (DISSEMIN_API, doi))
request.raise_for_status()
result = request.json()
assert result["status"] == "ok"
return ([i
for i in result["paper"]["publications"]
if i["doi"] == doi][0])["policy"]
@ -185,8 +179,8 @@ def get_linked_version(doi):
'http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01'
"""
try:
r = requests.head(to_URL(doi))
return r.headers.get("location")
request = requests.head(to_url(doi))
return request.headers.get("location")
except RequestException:
return None
@ -206,10 +200,10 @@ def get_bibtex(doi):
'@article{Verney_2015,\\n\\tdoi = {10.1209/0295-5075/111/40005},\\n\\turl = {http://dx.doi.org/10.1209/0295-5075/111/40005},\\n\\tyear = 2015,\\n\\tmonth = {aug},\\n\\tpublisher = {{IOP} Publishing},\\n\\tvolume = {111},\\n\\tnumber = {4},\\n\\tpages = {40005},\\n\\tauthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\n\\ttitle = {Hybridization of first and second sound in a weakly interacting Bose gas},\\n\\tjournal = {{EPL}}\\n}'
"""
try:
r = requests.get(to_URL(doi),
headers={"accept": "application/x-bibtex"})
r.raise_for_status()
assert(r.headers.get("content-type") == "application/x-bibtex")
return r.text
request = requests.get(to_url(doi),
headers={"accept": "application/x-bibtex"})
request.raise_for_status()
assert request.headers.get("content-type") == "application/x-bibtex"
return request.text
except (RequestException, AssertionError):
return None


@ -3,24 +3,89 @@ This file contains functions to download some papers locally, possibly using
a proxy.
"""
import socket
import socks
import sys
import urllib
import socks
# Default socket to use, if no proxy is used
DEFAULT_SOCKET = socket.socket
def download(url, proxies=[None]):
def _download_helper(url):
"""
Handle the download of a URL, using the proxy currently set in \
:mod:`socks`.
:param url: The URL to download.
:returns: A tuple of the raw content of the downloaded data and its \
associated content-type. Returns None if it was \
unable to download the document.
"""
# Try to fetch the URL using the current proxy
try:
request = urllib.request.urlopen(url)
try:
size = int(dict(request.info())['content-length'].strip())
except KeyError:
try:
size = int(dict(request.info())['Content-Length'].strip())
except KeyError:
size = 0
# Download the document
doc = b""
doc_size = 0
while True:
buf = request.read(1024)
if buf:
doc += buf
doc_size += len(buf)
if size != 0:
# Write progress bar on stdout
done = int(50 * doc_size / size)
sys.stdout.write("\r[%s%s]" %
('='*done, ' '*(50-done)))
sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
sys.stdout.flush()
else:
break
# Fetch content type
contenttype = None
contenttype_req = None
try:
contenttype_req = dict(request.info())['content-type']
except KeyError:
try:
contenttype_req = dict(request.info())['Content-Type']
except KeyError:
return None
if 'pdf' in contenttype_req:
contenttype = 'pdf'
elif 'djvu' in contenttype_req:
contenttype = 'djvu'
# Check content type and status code are ok
if request.getcode() != 200 or contenttype is None:
# Else, try with the next available proxy
return None
# Return a tuple of the downloaded content and the content-type
return (doc, contenttype)
# If an exception occurred, continue with next available proxy
except (urllib.error.URLError, socket.error, ValueError):
return None
def download(url, proxies=None):
"""
Download a PDF or DJVU document from a URL, possibly using proxies.
:param url: The URL to the PDF/DJVU document to fetch.
:param proxies: An optional list of proxies to use. Proxies will be \
used sequentially. Proxies should be a list of proxy strings. \
Do not forget to include ``None`` in the list if you want to try \
direct fetching without any proxy.
Do not forget to include ``""`` (empty string) in the list if \
you want to try direct fetching without any proxy.
:returns: A tuple of the raw content of the downloaded data and its \
associated content-type. Returns ``(None, None)`` if it was \
@ -28,10 +93,14 @@ def download(url, proxies=[None]):
>>> download("http://arxiv.org/pdf/1312.4006.pdf") # doctest: +SKIP
"""
# Handle default argument
if proxies is None:
proxies = [""]
# Loop over all available connections
for proxy in proxies:
# Handle no proxy case
if proxy is None:
if proxy == "":
socket.socket = DEFAULT_SOCKET
# Handle SOCKS proxy
elif proxy.startswith('socks'):
@ -55,58 +124,9 @@ def download(url, proxies=[None]):
socks.set_default_proxy(socks.HTTP, proxy, port)
socket.socket = socks.socksocket
# Try to fetch the URL using the current proxy
try:
r = urllib.request.urlopen(url)
try:
size = int(dict(r.info())['content-length'].strip())
except KeyError:
try:
size = int(dict(r.info())['Content-Length'].strip())
except KeyError:
size = 0
# Download the document
dl = b""
dl_size = 0
while True:
buf = r.read(1024)
if buf:
dl += buf
dl_size += len(buf)
if size != 0:
# Write progress bar on stdout
done = int(50 * dl_size / size)
sys.stdout.write("\r[%s%s]" %
('='*done, ' '*(50-done)))
sys.stdout.write(" "+str(int(float(done)/52*100))+"%")
sys.stdout.flush()
else:
break
# Fetch content type
contenttype = False
contenttype_req = None
try:
contenttype_req = dict(r.info())['content-type']
except KeyError:
try:
contenttype_req = dict(r.info())['Content-Type']
except KeyError:
continue
if 'pdf' in contenttype_req:
contenttype = 'pdf'
elif 'djvu' in contenttype_req:
contenttype = 'djvu'
# Check content type and status code are ok
if r.getcode() != 200 or contenttype is False:
# Else, try with the next available proxy
continue
# Return a tuple of the downloaded content and the content-type
return (dl, contenttype)
# If an exception occurred, continue with next available proxy
except (urllib.error.URLError, socket.error, ValueError):
continue
downloaded = _download_helper(url)
if downloaded is not None:
return downloaded
# In case of running out of proxies, return (None, None)
return (None, None)
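A short usage sketch of the reworked ``download``; the URL comes from the doctest above, the output file name is arbitrary, and network access is assumed:

from libbmc import fetcher

# Try a direct fetch only (the empty string means "no proxy").
data, content_type = fetcher.download("http://arxiv.org/pdf/1312.4006.pdf",
                                      proxies=[""])
if data is not None:
    # content_type is either "pdf" or "djvu"
    with open("1312.4006.pdf", "wb") as output:
        output.write(data)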


@ -11,11 +11,11 @@ from libbmc import __valid_identifiers__
__valid_identifiers__ += ["isbn"]
def is_valid(isbn):
def is_valid(isbn_id):
"""
Check that a given string is a valid ISBN.
:param isbn: the isbn to be checked.
:param isbn_id: the isbn to be checked.
:returns: boolean indicating whether the isbn is valid or not.
>>> is_valid("978-3-16-148410-0")
@ -43,9 +43,9 @@ def is_valid(isbn):
True
"""
return (
(not isbnlib.notisbn(isbn)) and (
isbnlib.get_canonical_isbn(isbn) == isbn or
isbnlib.mask(isbnlib.get_canonical_isbn(isbn)) == isbn)
(not isbnlib.notisbn(isbn_id)) and (
isbnlib.get_canonical_isbn(isbn_id) == isbn_id or
isbnlib.mask(isbnlib.get_canonical_isbn(isbn_id)) == isbn_id)
)
@ -64,28 +64,28 @@ def extract_from_text(text):
return [i for i in isbns if i is not None]
def get_bibtex(isbn):
def get_bibtex(isbn_identifier):
"""
Get a BibTeX string for the given ISBN.
:param isbn: ISBN to fetch BibTeX entry for.
:param isbn_identifier: ISBN to fetch BibTeX entry for.
:returns: A BibTeX string, or ``None`` if it could not be fetched.
>>> get_bibtex('9783161484100')
'@book{9783161484100,\\n title = {Berkeley, Oakland: Albany, Emeryville, Alameda, Kensington},\\n author = {Peekaboo Maps},\\n isbn = {9783161484100},\\n year = {2009},\\n publisher = {Peek A Boo Maps}\\n}'
"""
# Try to find the BibTeX using associated DOIs
bibtex = doi.get_bibtex(to_DOI(isbn))
bibtex = doi.get_bibtex(to_doi(isbn_identifier))
if bibtex is None:
# In some cases, there are no DOIs for a given ISBN. In this case, try
# to fetch bibtex directly from the ISBN, using a combination of
# Google Books and worldcat.org results.
bibtex = isbnlib.registry.bibformatters['bibtex'](
isbnlib.meta(isbn, 'default'))
isbnlib.meta(isbn_identifier, 'default'))
return bibtex
def to_DOI(isbn):
def to_doi(isbn_identifier):
"""
Make a DOI out of the given ISBN.
@ -94,16 +94,16 @@ def to_DOI(isbn):
See https://github.com/xlcnd/isbnlib#note. The returned DOI may not be
issued yet.
:param isbn: A valid ISBN string.
:param isbn_identifier: A valid ISBN string.
:returns: A DOI as string.
>>> to_DOI('9783161484100')
>>> to_doi('9783161484100')
'10.978.316/1484100'
"""
return isbnlib.doi(isbn)
return isbnlib.doi(isbn_identifier)
def from_DOI(doi):
def from_doi(doi_identifier):
"""
Make an ISBN out of the given DOI.
@ -119,10 +119,10 @@ def from_DOI(doi):
issued yet (it is a valid one, but not necessarily corresponding to a
valid book).
:param doi: A valid canonical DOI.
:param doi_identifier: A valid canonical DOI.
:returns: An ISBN string.
>>> from_DOI('10.978.316/1484100')
>>> from_doi('10.978.316/1484100')
'9783161484100'
"""
return "".join(c for c in doi[2:] if c in "0123456789xX")
return "".join(c for c in doi_identifier[2:] if c in "0123456789xX")


@ -1,6 +0,0 @@
from libbmc.papers import identifiers
__all__ = [
"identifiers",
"tearpages"
]


@ -13,8 +13,8 @@ import sys
from libbmc import __valid_identifiers__
# Import all the modules associated to __valid_identifiers__
for type in __valid_identifiers__:
importlib.import_module("libbmc.%s" % (type,))
for valid_identifier in __valid_identifiers__:
importlib.import_module("libbmc.%s" % (valid_identifier,))
def find_identifiers(src):
@ -53,18 +53,19 @@ def find_identifiers(src):
while totext.poll() is None:
extract_full = ' '.join([i.decode("utf-8").strip()
for i in totext.stdout.readlines()])
for i in totext.stdout.readlines()])
# Loop over all the valid identifier types
for type in __valid_identifiers__:
for identifier in __valid_identifiers__:
# Dynamically call the ``extract_from_text`` method for the
# associated module.
m = sys.modules.get("libbmc.%s" % (type,), None)
if m is None:
module = sys.modules.get("libbmc.%s" % (identifier,), None)
if module is None:
continue
found_id = getattr(m, "extract_from_text")(extract_full)
found_id = getattr(module, "extract_from_text")(extract_full)
if found_id:
totext.terminate()
return (type, found_id[0]) # found_id is a list of found IDs
# found_id is a list of found IDs
return (identifier, found_id[0])
return (None, None)
@ -80,12 +81,12 @@ def get_bibtex(identifier):
:returns: A BibTeX string or ``None`` if an error occurred.
# TODO: Should return a BibTeX object?
"""
type, id = identifier
if type not in __valid_identifiers__:
identifier_type, identifier_id = identifier
if identifier_type not in __valid_identifiers__:
return None
# Dynamically call the ``get_bibtex`` method from the associated module.
m = sys.modules.get("libbmc.%s" % (type,), None)
if m is None:
module = sys.modules.get("libbmc.%s" % (identifier_type,), None)
if module is None:
return None
return getattr(m, "get_bibtex")(id)
return getattr(module, "get_bibtex")(identifier_id)
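A hedged sketch of how the renamed helpers chain together; the PDF path is hypothetical, and ``find_identifiers`` is assumed to take a path to a PDF file, as its use of a text-extraction subprocess above suggests:

from libbmc.papers import identifiers

# Look for a DOI, arXiv id, ISBN, ... in the text of the PDF.
id_type, id_value = identifiers.find_identifiers("paper.pdf")
if id_type is not None:
    # Dispatch to the module matching the identifier type to get a BibTeX entry.
    bibtex = identifiers.get_bibtex((id_type, id_value))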


@ -21,7 +21,7 @@ BAD_JOURNALS = {
}
def fixPdf(pdfFile, destination):
def fix_pdf(pdf_file, destination):
"""
Fix malformed pdf files when data are present after '%%EOF'
@ -33,18 +33,16 @@ def fixPdf(pdfFile, destination):
:param destination: Path where the fixed PDF is written.
"""
tmp = tempfile.NamedTemporaryFile()
output = open(tmp.name, 'wb')
with open(pdfFile, "rb") as fh:
with open(pdfFile, "rb") as fh:
with open(tmp.name, 'wb') as output:
with open(pdf_file, "rb") as fh:
for line in fh:
output.write(line)
if b'%%EOF' in line:
break
output.close()
shutil.copy(tmp.name, destination)
def tearpage_backend(filename, teared_pages=[0]):
def tearpage_backend(filename, teared_pages=None):
"""
Copy filename to a tempfile, then write the pages back to filename except the torn ones.
@ -56,29 +54,35 @@ def tearpage_backend(filename, teared_pages=[0]):
:param teared_pages: Numbers of the pages to tear. Defaults to the first \
page only.
"""
# Handle default argument
if teared_pages is None:
teared_pages = [0]
# Copy the pdf to a tmp file
tmp = tempfile.NamedTemporaryFile()
shutil.copy(filename, tmp.name)
with tempfile.NamedTemporaryFile() as tmp:
# Copy the input file to tmp
shutil.copy(filename, tmp.name)
# Read the copied pdf
try:
input_file = PdfFileReader(open(tmp.name, 'rb'))
except PdfReadError:
fixPdf(filename, tmp.name)
input_file = PdfFileReader(open(tmp.name, 'rb'))
# Seek for the number of pages
num_pages = input_file.getNumPages()
# Read the copied pdf
# TODO: Use with syntax
try:
input_file = PdfFileReader(open(tmp.name, 'rb'))
except PdfReadError:
fix_pdf(filename, tmp.name)
input_file = PdfFileReader(open(tmp.name, 'rb'))
# Seek for the number of pages
num_pages = input_file.getNumPages()
# Write all pages except the torn ones
output_file = PdfFileWriter()
for i in range(num_pages):
if i in teared_pages:
continue
output_file.addPage(input_file.getPage(i))
# Write all pages except the torn ones
output_file = PdfFileWriter()
for i in range(num_pages):
if i in teared_pages:
continue
output_file.addPage(input_file.getPage(i))
tmp.close()
outputStream = open(filename, "wb")
output_file.write(outputStream)
tmp.close()
outputStream = open(filename, "wb")
output_file.write(outputStream)
def tearpage_needed(bibtex):
@ -89,16 +93,16 @@ def tearpage_needed(bibtex):
whether tearing is needed.
:returns: A list of pages to tear.
"""
for p in BAD_JOURNALS:
if p in bibtex.get("journal", "").lower():
for publisher in BAD_JOURNALS:
if publisher in bibtex.get("journal", "").lower():
# Bad journal is found, add pages to tear
return BAD_JOURNALS[p]
return BAD_JOURNALS[publisher]
# If no bad journals are found, return an empty list
return []
def tearpage(filename, bibtex=None, force=False):
def tearpage(filename, bibtex=None, force=None):
"""
Tear some pages of the file if needed.
@ -112,7 +116,7 @@ def tearpage(filename, bibtex=None, force=False):
"""
# Fetch pages to tear
pages_to_tear = []
if force is not False:
if force is not None:
pages_to_tear = force
elif bibtex is not None:
pages_to_tear = tearpage_needed(bibtex)
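A minimal usage sketch; the module path ``libbmc.papers.tearpages`` is inferred from the ``__all__`` list removed earlier in this commit, the file name is hypothetical, and the rest of ``tearpage`` is assumed to hand ``pages_to_tear`` to ``tearpage_backend``:

from libbmc.papers import tearpages

# Tear a specific page regardless of the journal (force is now a list or None).
tearpages.tearpage("paper.pdf", force=[0])

# Or let tearpage_needed() decide from the BibTeX "journal" field.
tearpages.tearpage("paper.pdf", bibtex={"journal": "Some Journal"})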


@ -1,5 +0,0 @@
from libbmc.repositories import arxiv, hal
__all__ = [
"arxiv", "hal"
]


@ -1,15 +1,18 @@
"""
This file contains all the arXiv-related functions.
"""
import arxiv2bib
import bibtexparser
import io
import re
import requests
import tarfile
import xml.etree.ElementTree
from urllib.error import HTTPError
import arxiv2bib
import bibtexparser
import requests
from requests.exceptions import RequestException
@ -268,7 +271,7 @@ def is_valid(arxiv_id):
False
"""
match = REGEX.match(arxiv_id)
return ((match is not None) and (match.group(0) == arxiv_id))
return (match is not None) and (match.group(0) == arxiv_id)
def get_bibtex(arxiv_id):
@ -320,17 +323,17 @@ def extract_from_text(text):
for i in REGEX.findall(text) if i[0] != ''])
def to_URL(arxiv_ids):
def to_url(arxiv_ids):
"""
Convert a list of arXiv eprint ids to a list of arXiv URLs.
:param arxiv_ids: List of arXiv eprint ids.
:returns: A list of arXiv URLs.
>>> to_URL('1506.06690')
>>> to_url('1506.06690')
'http://arxiv.org/abs/1506.06690'
>>> to_URL('1506.06690v1')
>>> to_url('1506.06690v1')
'http://arxiv.org/abs/1506.06690v1'
"""
if isinstance(arxiv_ids, list):
@ -358,16 +361,10 @@ def to_canonical(urls):
>>> to_canonical('aaa') is None
True
"""
try:
if isinstance(urls, list):
return [next(iter(extract_from_text(url))) for url in urls]
else:
return next(iter(extract_from_text(urls)))
except StopIteration:
return None
return tools.map_or_apply(extract_from_text, urls)
def from_DOI(doi):
def from_doi(doi):
"""
Get the arXiv eprint id for a given DOI.
@ -379,29 +376,29 @@ def from_DOI(doi):
:param doi: The DOI of the resource to look for.
:returns: The arXiv eprint id, or ``None`` if not found.
>>> from_DOI('10.1209/0295-5075/111/40005')
>>> from_doi('10.1209/0295-5075/111/40005')
# Note: This test does not pass due to an arXiv API bug.
'1506.06690'
"""
try:
r = requests.get("http://export.arxiv.org/api/query",
params={
"search_query": "doi:%s" % (doi,),
"max_results": 1
})
r.raise_for_status()
request = requests.get("http://export.arxiv.org/api/query",
params={
"search_query": "doi:%s" % (doi,),
"max_results": 1
})
request.raise_for_status()
except RequestException:
return None
e = xml.etree.ElementTree.fromstring(r.content)
for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
id = entry.find("{http://www.w3.org/2005/Atom}id").text
# id is an arXiv full URL. We only want the id which is the last URL
# component.
return id.split("/")[-1]
root = xml.etree.ElementTree.fromstring(request.content)
for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
arxiv_id = entry.find("{http://www.w3.org/2005/Atom}id").text
# arxiv_id is an arXiv full URL. We only want the id which is the last
# URL component.
return arxiv_id.split("/")[-1]
return None
def to_DOI(arxiv_id):
def to_doi(arxiv_id):
"""
Get the associated DOI for a given arXiv eprint.
@ -413,23 +410,23 @@ def to_DOI(arxiv_id):
:param arxiv_id: The arXiv eprint id.
:returns: The DOI if any, or ``None``.
>>> to_DOI('1506.06690v1')
>>> to_doi('1506.06690v1')
'10.1209/0295-5075/111/40005'
>>> to_DOI('1506.06690')
>>> to_doi('1506.06690')
'10.1209/0295-5075/111/40005'
"""
try:
r = requests.get("http://export.arxiv.org/api/query",
params={
"id_list": arxiv_id,
"max_results": 1
})
r.raise_for_status()
request = requests.get("http://export.arxiv.org/api/query",
params={
"id_list": arxiv_id,
"max_results": 1
})
request.raise_for_status()
except RequestException:
return None
e = xml.etree.ElementTree.fromstring(r.content)
for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
root = xml.etree.ElementTree.fromstring(request.content)
for entry in root.iter("{http://www.w3.org/2005/Atom}entry"):
doi = entry.find("{http://arxiv.org/schemas/atom}doi")
if doi is not None:
return doi.text
@ -451,9 +448,9 @@ def get_sources(arxiv_id):
``None``.
"""
try:
r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
r.raise_for_status()
file_object = io.BytesIO(r.content)
request = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
request.raise_for_status()
file_object = io.BytesIO(request.content)
return tarfile.open(fileobj=file_object)
except (RequestException, AssertionError, tarfile.TarError):
return None
@ -473,9 +470,9 @@ def get_bbl(arxiv_id):
:returns: A list of the full text of the ``.bbl`` files (if any) \
or ``None``.
"""
tf = get_sources(arxiv_id)
bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")]
bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING)
tar_file = get_sources(arxiv_id)
bbl_files = [i for i in tar_file.getmembers() if i.name.endswith(".bbl")]
bbl_files = [tar_file.extractfile(member).read().decode(tarfile.ENCODING)
for member in bbl_files]
return bbl_files
@ -498,5 +495,5 @@ def get_citations(arxiv_id):
bbl_files = get_bbl(arxiv_id)
for bbl_file in bbl_files:
# Fetch the cited DOIs for each of the bbl files
dois.update(bbl.get_cited_DOIs(bbl_file))
dois.update(bbl.get_cited_dois(bbl_file))
return dois
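A short usage sketch tying the arXiv and bbl helpers together; the eprint id comes from the doctests above, and network access to arXiv is assumed:

from libbmc.repositories import arxiv

# Fetch the eprint sources, read the .bbl files and resolve the cited DOIs.
cited = arxiv.get_citations("1506.06690")
# ``cited`` maps each plaintext citation found in the .bbl files to its DOI.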


@ -33,7 +33,7 @@ def is_valid(hal_id):
False
"""
match = REGEX.match(hal_id)
return ((match is not None) and (match.group(0) == hal_id))
return (match is not None) and (match.group(0) == hal_id)
def extract_from_text(text):


@ -9,9 +9,11 @@ from itertools import islice, chain
# Huge URL regex taken from https://gist.github.com/gruber/8891611
URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")
_SLUGIFY_STRIP_RE = re.compile(r'[^\w\s-]')
_SLUGIFY_HYPHENATE_RE = re.compile(r'[\s]+')
def replaceAll(text, replace_dict):
def replace_all(text, replace_dict):
"""
Replace multiple strings in a text.
@ -26,7 +28,7 @@ def replaceAll(text, replace_dict):
substitution.
:returns: Text after replacements.
>>> replaceAll("foo bar foo thing", {"foo": "oof", "bar": "rab"})
>>> replace_all("foo bar foo thing", {"foo": "oof", "bar": "rab"})
'oof rab oof thing'
"""
for i, j in replace_dict.items():
@ -34,6 +36,24 @@ def replaceAll(text, replace_dict):
return text
def map_or_apply(function, param):
"""
Map the function on ``param``, or apply it, depending on whether ``param`` \
is a list or a single item.
:param function: The function to apply.
:param param: The parameter to feed the function with (list or item).
:returns: The computed value or ``None``.
"""
try:
if isinstance(param, list):
return [next(iter(function(i))) for i in param]
else:
return next(iter(function(param)))
except StopIteration:
return None
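For illustration, a hedged sketch of how ``map_or_apply`` replaces the duplicated try/except blocks removed from ``to_canonical`` in the doi and arxiv modules above; the DOI values come from the doctests earlier in this diff:

from libbmc import doi, tools

# Single item: extract_from_text is applied and the first match is returned.
tools.map_or_apply(doi.extract_from_text,
                   "http://dx.doi.org/10.1209/0295-5075/111/40005")
# -> '10.1209/0295-5075/111/40005'

# List of items: the function is mapped over each element instead.
tools.map_or_apply(doi.extract_from_text,
                   ["http://dx.doi.org/10.1209/0295-5075/111/40005"])
# -> ['10.1209/0295-5075/111/40005']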
def clean_whitespaces(text):
"""
Remove multiple whitespaces from text. Also removes leading and trailing \
@ -85,13 +105,13 @@ def batch(iterable, size):
>>> [list(i) for i in batch([1, 2, 3, 4, 5], 2)]
[[1, 2], [3, 4], [5]]
"""
it = iter(iterable)
item = iter(iterable)
while True:
bi = islice(it, size)
yield chain([next(bi)], bi)
batch_iterator = islice(item, size)
yield chain([next(batch_iterator)], batch_iterator)
def remove_URLs(text):
def remove_urls(text):
"""
Remove URLs from a given text (only removes http, https and naked domain \
URLs).
@ -99,16 +119,12 @@ def remove_URLs(text):
:param text: The text to remove URLs from.
:returns: The text without URLs.
>>> remove_URLs("foobar http://example.com https://example.com foobar")
>>> remove_urls("foobar http://example.com https://example.com foobar")
'foobar foobar'
"""
return clean_whitespaces(URL_REGEX.sub("", text))
_slugify_strip_re = re.compile(r'[^\w\s-]')
_slugify_hyphenate_re = re.compile(r'[\s]+')
def slugify(value):
"""
Normalizes string, converts to lowercase, removes non-alpha characters,
@ -127,5 +143,5 @@ def slugify(value):
value = unicode_type(value)
value = (unicodedata.normalize('NFKD', value).
encode('ascii', 'ignore').decode('ascii'))
value = unicode_type(_slugify_strip_re.sub('', value).strip())
return _slugify_hyphenate_re.sub('_', value)
value = unicode_type(_SLUGIFY_STRIP_RE.sub('', value).strip())
return _SLUGIFY_HYPHENATE_RE.sub('_', value)