diff --git a/libbmc/__init__.py b/libbmc/__init__.py index cd75f8b..6ab83eb 100644 --- a/libbmc/__init__.py +++ b/libbmc/__init__.py @@ -1,15 +1,11 @@ +""" +libbmc + +The :mod:`libbmc` is a generic Python library to manage bibliography and play +with scientific papers. +""" + # Global list of valid paper identifier types. See README.md. __valid_identifiers__ = [] -# Import order of the modules is important, as they will populate -# `__valid_identifiers__` on load, and the order in this list reflects their -# priority. -from libbmc import bibtex, doi, fetcher, isbn # noqa -from libbmc import citations, papers, repositories # noqa - __version__ = "0.1.3.1" - -__all__ = [ - "bibtex", "doi", "fetcher", "isbn", - "citations", "papers", "repositories", -] diff --git a/libbmc/citations/__init__.py b/libbmc/citations/__init__.py index 5f42dd7..e69de29 100644 --- a/libbmc/citations/__init__.py +++ b/libbmc/citations/__init__.py @@ -1,5 +0,0 @@ -from libbmc.citations import bbl, bibtex, pdf, plaintext - -__all__ = [ - "bbl", "bibtex", "pdf", "plaintext" -] diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py index 3fde99e..71be937 100644 --- a/libbmc/citations/bbl.py +++ b/libbmc/citations/bbl.py @@ -73,7 +73,7 @@ def get_plaintext_citations(bbl): return cleaned_bbl -def get_cited_DOIs(bbl): +def get_cited_dois(bbl): """ Get the DOIs of the papers cited in a .bbl file. @@ -85,4 +85,4 @@ def get_cited_DOIs(bbl): # Get the plaintext citations from the bbl file plaintext_citations = get_plaintext_citations(bbl) # Use the plaintext citations parser on these citations - return plaintext.get_cited_DOIs(plaintext_citations) + return plaintext.get_cited_dois(plaintext_citations) diff --git a/libbmc/citations/bibtex.py b/libbmc/citations/bibtex.py index 4c357d8..7d03bc3 100644 --- a/libbmc/citations/bibtex.py +++ b/libbmc/citations/bibtex.py @@ -2,15 +2,20 @@ This files contains all the functions to extract DOIs of citations from BibTeX files. """ -import bibtexparser import os + +import bibtexparser + from bibtexparser.bparser import BibTexParser from bibtexparser.customization import convert_to_unicode + from libbmc import tools from libbmc.citations import plaintext +# TODO: Use beta.dissem.in with formatted citation + def bibentry_as_plaintext(bibentry): """ @@ -51,7 +56,7 @@ def get_plaintext_citations(bibtex): return bibentries -def get_cited_DOIs(bibtex): +def get_cited_dois(bibtex): """ Get the DOIs of the papers cited in a BibTeX file. @@ -71,4 +76,4 @@ def get_cited_DOIs(bibtex): # Get the plaintext citations from the bibtex file plaintext_citations = get_plaintext_citations(bibtex) # Use the plaintext citations parser on these citations - return plaintext.get_cited_DOIs(plaintext_citations) + return plaintext.get_cited_dois(plaintext_citations) diff --git a/libbmc/citations/pdf.py b/libbmc/citations/pdf.py index f64baa1..d6f8eb4 100644 --- a/libbmc/citations/pdf.py +++ b/libbmc/citations/pdf.py @@ -3,10 +3,11 @@ This files contains all the functions to extract DOIs of citations from PDF files. 
""" import os -import requests import subprocess import xml.etree.ElementTree as ET +import requests + from requests.exceptions import RequestException from libbmc import tools @@ -17,7 +18,7 @@ CERMINE_BASE_URL = "http://cermine.ceon.pl/" SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -def cermine(pdf_file, force_API=False, override_local=None): +def cermine(pdf_file, force_api=False, override_local=None): """ Run `CERMINE `_ to extract metadata from \ the given PDF file, to retrieve citations (and more) from the \ @@ -44,7 +45,7 @@ def cermine(pdf_file, force_API=False, override_local=None): the CERMINE API terms. :param pdf_file: Path to the PDF file to handle. - :param force_API: Force the use of the Cermine API \ + :param force_api: Force the use of the Cermine API \ (and do not try to use a local JAR file). Defaults to ``False``. :param override_local: Use this specific JAR file, instead of the one at \ the default location (``libbmc/external/cermine.jar``). @@ -55,23 +56,23 @@ def cermine(pdf_file, force_API=False, override_local=None): # Check if we want to load the local JAR from a specific path local = override_local # Else, try to stat the JAR file at the expected local path - if (local is None) and (not force_API): + if (local is None) and (not force_api): if os.path.isfile(os.path.join(SCRIPT_DIR, "../external/cermine.jar")): local = os.path.join(SCRIPT_DIR, "../external/cermine.jar") # If we want to force the API use, or we could not get a local JAR - if force_API or (local is None): + if force_api or (local is None): print("Using API") with open(pdf_file, "rb") as fh: - # Query the API - r = requests.post( - CERMINE_BASE_URL + "extract.do", - headers={"Content-Type": "application/binary"}, - files={"file": fh} - ) - return r.text + # Query the API + request = requests.post( + CERMINE_BASE_URL + "extract.do", + headers={"Content-Type": "application/binary"}, + files={"file": fh} + ) + return request.text # Else, use the local JAR file else: return subprocess.check_output([ @@ -86,7 +87,7 @@ def cermine(pdf_file, force_API=False, override_local=None): return None -def cermine_dois(pdf_file, force_API=False, override_local=None): +def cermine_dois(pdf_file, force_api=False, override_local=None): """ Run `CERMINE `_ to extract DOIs of cited \ papers from a PDF file. @@ -116,7 +117,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None): try to match them on Crossref to get DOIs. :param pdf_file: Path to the PDF file to handle. - :param force_API: Force the use of the Cermine API \ + :param force_api: Force the use of the Cermine API \ (and do not try to use a local JAR file). Defaults to ``False``. :param override_local: Use this specific JAR file, instead of the one at \ the default location (``libbmc/external/cermine.jar``). 
@@ -126,7 +127,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None): # * Do not convert to plain text, but use the extra metadata from # CERMINE # Call CERMINE on the PDF file - cermine_output = cermine(pdf_file, force_API, override_local) + cermine_output = cermine(pdf_file, force_api, override_local) # Parse the resulting XML root = ET.fromstring(cermine_output) plaintext_references = [ @@ -136,7 +137,7 @@ def cermine_dois(pdf_file, force_API=False, override_local=None): ET.tostring(e, method="text").decode("utf-8").replace(e.text, "")) for e in root.iter("mixed-citation")] # Call the plaintext methods to fetch DOIs - return plaintext.get_cited_DOIs(plaintext_references) + return plaintext.get_cited_dois(plaintext_references) def grobid(pdf_folder, grobid_home=None, grobid_jar=None): @@ -156,6 +157,8 @@ def grobid(pdf_folder, grobid_home=None, grobid_jar=None): :param grobid_jar: Path to the built Grobid JAR file. :returns: ``True``, or ``False`` if an error occurred. """ + # TODO: Should be using https://github.com/kermitt2/grobid-example and + # BibTeX backend. if grobid_home is None or grobid_jar is None: # User should pass the correct paths return False @@ -234,4 +237,4 @@ def pdfextract_dois(pdf_file): root = ET.fromstring(references) plaintext_references = [e.text for e in root.iter("reference")] # Call the plaintext methods to fetch DOIs - return plaintext.get_cited_DOIs(plaintext_references) + return plaintext.get_cited_dois(plaintext_references) diff --git a/libbmc/citations/plaintext.py b/libbmc/citations/plaintext.py index bdd4f19..16647a7 100644 --- a/libbmc/citations/plaintext.py +++ b/libbmc/citations/plaintext.py @@ -37,7 +37,7 @@ def get_plaintext_citations(file): return cleaned_citations -def get_cited_DOIs(file): +def get_cited_dois(file): """ Get the DOIs of the papers cited in a plaintext file. The file should \ have one citation per line. @@ -66,29 +66,29 @@ def get_cited_DOIs(file): # Try to get the DOI directly from the citation for citation in plaintext_citations[:]: # Some citations already contain a DOI so try to match it directly - matched_DOIs = doi.extract_from_text(citation) - if len(matched_DOIs) > 0: + matched_dois = doi.extract_from_text(citation) + if len(matched_dois) > 0: # Add the DOI and go on - dois[citation] = next(iter(matched_DOIs)) + dois[citation] = next(iter(matched_dois)) continue # Same thing for arXiv id - matched_arXiv = arxiv.extract_from_text(citation) - if len(matched_arXiv) > 0: + matched_arxiv = arxiv.extract_from_text(citation) + if len(matched_arxiv) > 0: # Add the associated DOI and go on - dois[citation] = arxiv.to_DOI(next(iter(matched_arXiv))) + dois[citation] = arxiv.to_doi(next(iter(matched_arxiv))) continue # If no match found, stack it for next step # Note to remove URLs in the citation as the plaintext citations can # contain URLs and they are bad for the CrossRef API. 
- crossref_queue.append(tools.remove_URLs(citation)) + crossref_queue.append(tools.remove_urls(citation)) # Do batch with remaining papers, to prevent from the timeout of CrossRef for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE): batch = [i for i in batch] try: # Fetch results from CrossRef - r = requests.post(CROSSREF_LINKS_API_URL, json=batch) - for result in r.json()["results"]: + request = requests.post(CROSSREF_LINKS_API_URL, json=batch) + for result in request.json()["results"]: # Try to get a DOI try: dois[result["text"]] = result["doi"] diff --git a/libbmc/doi.py b/libbmc/doi.py index bdd9f1e..5f539a6 100644 --- a/libbmc/doi.py +++ b/libbmc/doi.py @@ -55,7 +55,7 @@ def is_valid(doi): False """ match = REGEX.match(doi) - return ((match is not None) and (match.group(0) == doi)) + return (match is not None) and (match.group(0) == doi) def extract_from_text(text): @@ -71,17 +71,17 @@ def extract_from_text(text): return tools.remove_duplicates(REGEX.findall(text)) -def to_URL(dois): +def to_url(dois): """ Convert a list of canonical DOIs to a list of DOIs URLs. :param dois: List of canonical DOIs. Can also be a single canonical DOI. :returns: A list of DOIs URLs (resp. a single value). - >>> to_URL(['10.1209/0295-5075/111/40005']) + >>> to_url(['10.1209/0295-5075/111/40005']) ['http://dx.doi.org/10.1209/0295-5075/111/40005'] - >>> to_URL('10.1209/0295-5075/111/40005') + >>> to_url('10.1209/0295-5075/111/40005') 'http://dx.doi.org/10.1209/0295-5075/111/40005' """ if isinstance(dois, list): @@ -110,13 +110,7 @@ def to_canonical(urls): >>> to_canonical(['aaaa']) is None True """ - try: - if isinstance(urls, list): - return [next(iter(extract_from_text(url))) for url in urls] - else: - return next(iter(extract_from_text(urls))) - except StopIteration: - return None + return tools.map_or_apply(extract_from_text, urls) def get_oa_version(doi): @@ -134,10 +128,10 @@ def get_oa_version(doi): 'http://arxiv.org/abs/1506.06690' """ try: - r = requests.get("%s%s" % (DISSEMIN_API, doi)) - r.raise_for_status() - result = r.json() - assert(result["status"] == "ok") + request = requests.get("%s%s" % (DISSEMIN_API, doi)) + request.raise_for_status() + result = request.json() + assert result["status"] == "ok" return result["paper"]["pdf_url"] except (AssertionError, ValueError, KeyError, RequestException): return None @@ -162,10 +156,10 @@ def get_oa_policy(doi): True """ try: - r = requests.get("%s%s" % (DISSEMIN_API, doi)) - r.raise_for_status() - result = r.json() - assert(result["status"] == "ok") + request = requests.get("%s%s" % (DISSEMIN_API, doi)) + request.raise_for_status() + result = request.json() + assert result["status"] == "ok" return ([i for i in result["paper"]["publications"] if i["doi"] == doi][0])["policy"] @@ -185,8 +179,8 @@ def get_linked_version(doi): 'http://stacks.iop.org/0295-5075/111/i=4/a=40005?key=crossref.9ad851948a976ecdf216d4929b0b6f01' """ try: - r = requests.head(to_URL(doi)) - return r.headers.get("location") + request = requests.head(to_url(doi)) + return request.headers.get("location") except RequestException: return None @@ -206,10 +200,10 @@ def get_bibtex(doi): '@article{Verney_2015,\\n\\tdoi = {10.1209/0295-5075/111/40005},\\n\\turl = {http://dx.doi.org/10.1209/0295-5075/111/40005},\\n\\tyear = 2015,\\n\\tmonth = {aug},\\n\\tpublisher = {{IOP} Publishing},\\n\\tvolume = {111},\\n\\tnumber = {4},\\n\\tpages = {40005},\\n\\tauthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\n\\ttitle = {Hybridization of first and second sound in a 
weakly interacting Bose gas},\\n\\tjournal = {{EPL}}\\n}' """ try: - r = requests.get(to_URL(doi), - headers={"accept": "application/x-bibtex"}) - r.raise_for_status() - assert(r.headers.get("content-type") == "application/x-bibtex") - return r.text + request = requests.get(to_url(doi), + headers={"accept": "application/x-bibtex"}) + request.raise_for_status() + assert request.headers.get("content-type") == "application/x-bibtex" + return request.text except (RequestException, AssertionError): return None diff --git a/libbmc/fetcher.py b/libbmc/fetcher.py index f7473e4..3e0960c 100644 --- a/libbmc/fetcher.py +++ b/libbmc/fetcher.py @@ -3,24 +3,89 @@ This file contains functions to download locally some papers, eventually using a proxy. """ import socket -import socks import sys import urllib +import socks + # Default socket to use, if no proxy is used DEFAULT_SOCKET = socket.socket -def download(url, proxies=[None]): +def _download_helper(url): + """ + Handle the download of an URL, using the proxy currently set in \ + :mod:`socks`. + + :param url: The URL to download. + :returns: A tuple of the raw content of the downloaded data and its \ + associated content-type. Returns None if it was \ + unable to download the document. + """ + # Try to fetch the URL using the current proxy + try: + request = urllib.request.urlopen(url) + try: + size = int(dict(request.info())['content-length'].strip()) + except KeyError: + try: + size = int(dict(request.info())['Content-Length'].strip()) + except KeyError: + size = 0 + # Download the document + doc = b"" + doc_size = 0 + while True: + buf = request.read(1024) + if buf: + doc += buf + doc_size += len(buf) + if size != 0: + # Write progress bar on stdout + done = int(50 * doc_size / size) + sys.stdout.write("\r[%s%s]" % + ('='*done, ' '*(50-done))) + sys.stdout.write(" "+str(int(float(done)/52*100))+"%") + sys.stdout.flush() + else: + break + # Fetch content type + contenttype = None + contenttype_req = None + try: + contenttype_req = dict(request.info())['content-type'] + except KeyError: + try: + contenttype_req = dict(request.info())['Content-Type'] + except KeyError: + return None + if 'pdf' in contenttype_req: + contenttype = 'pdf' + elif 'djvu' in contenttype_req: + contenttype = 'djvu' + + # Check content type and status code are ok + if request.getcode() != 200 or contenttype is None: + # Else, try with the next available proxy + return None + + # Return a tuple of the downloaded content and the content-type + return (doc, contenttype) + # If an exception occurred, continue with next available proxy + except (urllib.error.URLError, socket.error, ValueError): + return None + + +def download(url, proxies=None): """ Download a PDF or DJVU document from a url, eventually using proxies. :params url: The URL to the PDF/DJVU document to fetch. :params proxies: An optional list of proxies to use. Proxies will be \ used sequentially. Proxies should be a list of proxy strings. \ - Do not forget to include ``None`` in the list if you want to try \ - direct fetching without any proxy. + Do not forget to include ``""`` (empty string) in the list if \ + you want to try direct fetching without any proxy. :returns: A tuple of the raw content of the downloaded data and its \ associated content-type. 
Returns ``(None, None)`` if it was \ @@ -28,10 +93,14 @@ def download(url, proxies=[None]): >>> download("http://arxiv.org/pdf/1312.4006.pdf") # doctest: +SKIP """ + # Handle default argument + if proxies is None: + proxies = [""] + # Loop over all available connections for proxy in proxies: # Handle no proxy case - if proxy is None: + if proxy == "": socket.socket = DEFAULT_SOCKET # Handle SOCKS proxy elif proxy.startswith('socks'): @@ -55,58 +124,9 @@ def download(url, proxies=[None]): socks.set_default_proxy(socks.HTTP, proxy, port) socket.socket = socks.socksocket - # Try to fetch the URL using the current proxy - try: - r = urllib.request.urlopen(url) - try: - size = int(dict(r.info())['content-length'].strip()) - except KeyError: - try: - size = int(dict(r.info())['Content-Length'].strip()) - except KeyError: - size = 0 - # Download the document - dl = b"" - dl_size = 0 - while True: - buf = r.read(1024) - if buf: - dl += buf - dl_size += len(buf) - if size != 0: - # Write progress bar on stdout - done = int(50 * dl_size / size) - sys.stdout.write("\r[%s%s]" % - ('='*done, ' '*(50-done))) - sys.stdout.write(" "+str(int(float(done)/52*100))+"%") - sys.stdout.flush() - else: - break - # Fetch content type - contenttype = False - contenttype_req = None - try: - contenttype_req = dict(r.info())['content-type'] - except KeyError: - try: - contenttype_req = dict(r.info())['Content-Type'] - except KeyError: - continue - if 'pdf' in contenttype_req: - contenttype = 'pdf' - elif 'djvu' in contenttype_req: - contenttype = 'djvu' - - # Check content type and status code are ok - if r.getcode() != 200 or contenttype is False: - # Else, try with the next available proxy - continue - - # Return a tuple of the downloaded content and the content-type - return (dl, contenttype) - # If an exception occurred, continue with next available proxy - except (urllib.error.URLError, socket.error, ValueError): - continue + downloaded = _download_helper(url) + if downloaded is not None: + return downloaded # In case of running out of proxies, return (None, None) return (None, None) diff --git a/libbmc/isbn.py b/libbmc/isbn.py index fe0e1c4..eb64be9 100644 --- a/libbmc/isbn.py +++ b/libbmc/isbn.py @@ -11,11 +11,11 @@ from libbmc import __valid_identifiers__ __valid_identifiers__ += ["isbn"] -def is_valid(isbn): +def is_valid(isbn_id): """ Check that a given string is a valid ISBN. - :param isbn: the isbn to be checked. + :param isbn_id: the isbn to be checked. :returns: boolean indicating whether the isbn is valid or not. >>> is_valid("978-3-16-148410-0") @@ -43,9 +43,9 @@ def is_valid(isbn): True """ return ( - (not isbnlib.notisbn(isbn)) and ( - isbnlib.get_canonical_isbn(isbn) == isbn or - isbnlib.mask(isbnlib.get_canonical_isbn(isbn)) == isbn) + (not isbnlib.notisbn(isbn_id)) and ( + isbnlib.get_canonical_isbn(isbn_id) == isbn_id or + isbnlib.mask(isbnlib.get_canonical_isbn(isbn_id)) == isbn_id) ) @@ -64,28 +64,28 @@ def extract_from_text(text): return [i for i in isbns if i is not None] -def get_bibtex(isbn): +def get_bibtex(isbn_identifier): """ Get a BibTeX string for the given ISBN. - :param isbn: ISBN to fetch BibTeX entry for. + :param isbn_identifier: ISBN to fetch BibTeX entry for. :returns: A BibTeX string or ``None`` if could not fetch it. 
>>> get_bibtex('9783161484100') '@book{9783161484100,\\n title = {Berkeley, Oakland: Albany, Emeryville, Alameda, Kensington},\\n author = {Peekaboo Maps},\\n isbn = {9783161484100},\\n year = {2009},\\n publisher = {Peek A Boo Maps}\\n}' """ # Try to find the BibTeX using associated DOIs - bibtex = doi.get_bibtex(to_DOI(isbn)) + bibtex = doi.get_bibtex(to_doi(isbn_identifier)) if bibtex is None: # In some cases, there are no DOIs for a given ISBN. In this case, try # to fetch bibtex directly from the ISBN, using a combination of # Google Books and worldcat.org results. bibtex = isbnlib.registry.bibformatters['bibtex']( - isbnlib.meta(isbn, 'default')) + isbnlib.meta(isbn_identifier, 'default')) return bibtex -def to_DOI(isbn): +def to_doi(isbn_identifier): """ Make a DOI out of the given ISBN. @@ -94,16 +94,16 @@ def to_DOI(isbn): See https://github.com/xlcnd/isbnlib#note. The returned DOI may not be issued yet. - :param isbn: A valid ISBN string. + :param isbn_identifier: A valid ISBN string. :returns: A DOI as string. - >>> to_DOI('9783161484100') + >>> to_doi('9783161484100') '10.978.316/1484100' """ - return isbnlib.doi(isbn) + return isbnlib.doi(isbn_identifier) -def from_DOI(doi): +def from_doi(doi_identifier): """ Make an ISBN out of the given DOI. @@ -119,10 +119,10 @@ def from_DOI(doi): issued yet (it is a valid one, but not necessary corresponding to a valid book). - :param doi: A valid canonical DOI. + :param doi_identifier: A valid canonical DOI. :returns: An ISBN string. - >>> from_DOI('10.978.316/1484100') + >>> from_doi('10.978.316/1484100') '9783161484100' """ - return "".join(c for c in doi[2:] if c in "0123456789xX") + return "".join(c for c in doi_identifier[2:] if c in "0123456789xX") diff --git a/libbmc/papers/__init__.py b/libbmc/papers/__init__.py index 7c48d92..e69de29 100644 --- a/libbmc/papers/__init__.py +++ b/libbmc/papers/__init__.py @@ -1,6 +0,0 @@ -from libbmc.papers import identifiers - -__all__ = [ - "identifiers", - "tearpages" -] diff --git a/libbmc/papers/identifiers.py b/libbmc/papers/identifiers.py index abca898..a633d5a 100644 --- a/libbmc/papers/identifiers.py +++ b/libbmc/papers/identifiers.py @@ -13,8 +13,8 @@ import sys from libbmc import __valid_identifiers__ # Import all the modules associated to __valid_identifiers__ -for type in __valid_identifiers__: - importlib.import_module("libbmc.%s" % (type,)) +for valid_identifier in __valid_identifiers__: + importlib.import_module("libbmc.%s" % (valid_identifier,)) def find_identifiers(src): @@ -53,18 +53,19 @@ def find_identifiers(src): while totext.poll() is None: extract_full = ' '.join([i.decode("utf-8").strip() - for i in totext.stdout.readlines()]) + for i in totext.stdout.readlines()]) # Loop over all the valid identifier types - for type in __valid_identifiers__: + for identifier in __valid_identifiers__: # Dynamically call the ``extract_from_text`` method for the # associated module. - m = sys.modules.get("libbmc.%s" % (type,), None) - if m is None: + module = sys.modules.get("libbmc.%s" % (identifier,), None) + if module is None: continue - found_id = getattr(m, "extract_from_text")(extract_full) + found_id = getattr(module, "extract_from_text")(extract_full) if found_id: totext.terminate() - return (type, found_id[0]) # found_id is a list of found IDs + # found_id is a list of found IDs + return (identifier, found_id[0]) return (None, None) @@ -80,12 +81,12 @@ def get_bibtex(identifier): :returns: A BibTeX string or ``None`` if an error occurred. # TODO: Should return a BiBTeX object? 
""" - type, id = identifier - if type not in __valid_identifiers__: + identifier_type, identifier_id = identifier + if identifier_type not in __valid_identifiers__: return None # Dynamically call the ``get_bibtex`` method from the associated module. - m = sys.modules.get("libbmc.%s" % (type,), None) - if m is None: + module = sys.modules.get("libbmc.%s" % (identifier_type,), None) + if module is None: return None - return getattr(m, "get_bibtex")(id) + return getattr(module, "get_bibtex")(identifier_id) diff --git a/libbmc/papers/tearpages.py b/libbmc/papers/tearpages.py index a58af58..0262c32 100644 --- a/libbmc/papers/tearpages.py +++ b/libbmc/papers/tearpages.py @@ -21,7 +21,7 @@ BAD_JOURNALS = { } -def fixPdf(pdfFile, destination): +def fix_pdf(pdf_file, destination): """ Fix malformed pdf files when data are present after '%%EOF' @@ -33,18 +33,16 @@ def fixPdf(pdfFile, destination): :param destination: destination """ tmp = tempfile.NamedTemporaryFile() - output = open(tmp.name, 'wb') - with open(pdfFile, "rb") as fh: - with open(pdfFile, "rb") as fh: + with open(tmp.name, 'wb') as output: + with open(pdf_file, "rb") as fh: for line in fh: output.write(line) if b'%%EOF' in line: break - output.close() shutil.copy(tmp.name, destination) -def tearpage_backend(filename, teared_pages=[0]): +def tearpage_backend(filename, teared_pages=None): """ Copy filename to a tempfile, write pages to filename except the teared one. @@ -56,29 +54,35 @@ def tearpage_backend(filename, teared_pages=[0]): :param teared_pages: Numbers of the pages to tear. Default to first page \ only. """ + # Handle default argument + if teared_pages is None: + teared_pages = [0] + # Copy the pdf to a tmp file - tmp = tempfile.NamedTemporaryFile() - shutil.copy(filename, tmp.name) + with tempfile.NamedTemporaryFile() as tmp: + # Copy the input file to tmp + shutil.copy(filename, tmp.name) - # Read the copied pdf - try: - input_file = PdfFileReader(open(tmp.name, 'rb')) - except PdfReadError: - fixPdf(filename, tmp.name) - input_file = PdfFileReader(open(tmp.name, 'rb')) - # Seek for the number of pages - num_pages = input_file.getNumPages() + # Read the copied pdf + # TODO: Use with syntax + try: + input_file = PdfFileReader(open(tmp.name, 'rb')) + except PdfReadError: + fix_pdf(filename, tmp.name) + input_file = PdfFileReader(open(tmp.name, 'rb')) + # Seek for the number of pages + num_pages = input_file.getNumPages() - # Write pages excepted the first one - output_file = PdfFileWriter() - for i in range(num_pages): - if i in teared_pages: - continue - output_file.addPage(input_file.getPage(i)) + # Write pages excepted the first one + output_file = PdfFileWriter() + for i in range(num_pages): + if i in teared_pages: + continue + output_file.addPage(input_file.getPage(i)) - tmp.close() - outputStream = open(filename, "wb") - output_file.write(outputStream) + tmp.close() + outputStream = open(filename, "wb") + output_file.write(outputStream) def tearpage_needed(bibtex): @@ -89,16 +93,16 @@ def tearpage_needed(bibtex): whether tearing is needed. :returns: A list of pages to tear. 
""" - for p in BAD_JOURNALS: - if p in bibtex.get("journal", "").lower(): + for publisher in BAD_JOURNALS: + if publisher in bibtex.get("journal", "").lower(): # Bad journal is found, add pages to tear - return BAD_JOURNALS[p] + return BAD_JOURNALS[publisher] # If no bad journals are found, return an empty list return [] -def tearpage(filename, bibtex=None, force=False): +def tearpage(filename, bibtex=None, force=None): """ Tear some pages of the file if needed. @@ -112,7 +116,7 @@ def tearpage(filename, bibtex=None, force=False): """ # Fetch pages to tear pages_to_tear = [] - if force is not False: + if force is not None: pages_to_tear = force elif bibtex is not None: pages_to_tear = tearpage_needed(bibtex) diff --git a/libbmc/repositories/__init__.py b/libbmc/repositories/__init__.py index c0839ee..e69de29 100644 --- a/libbmc/repositories/__init__.py +++ b/libbmc/repositories/__init__.py @@ -1,5 +0,0 @@ -from libbmc.repositories import arxiv, hal - -__all__ = [ - "arxiv", "hal" -] diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py index 335f480..38b5eee 100644 --- a/libbmc/repositories/arxiv.py +++ b/libbmc/repositories/arxiv.py @@ -1,15 +1,18 @@ """ This file contains all the arXiv-related functions. """ -import arxiv2bib -import bibtexparser import io import re -import requests import tarfile import xml.etree.ElementTree from urllib.error import HTTPError + + +import arxiv2bib +import bibtexparser +import requests + from requests.exceptions import RequestException @@ -268,7 +271,7 @@ def is_valid(arxiv_id): False """ match = REGEX.match(arxiv_id) - return ((match is not None) and (match.group(0) == arxiv_id)) + return (match is not None) and (match.group(0) == arxiv_id) def get_bibtex(arxiv_id): @@ -320,17 +323,17 @@ def extract_from_text(text): for i in REGEX.findall(text) if i[0] != '']) -def to_URL(arxiv_ids): +def to_url(arxiv_ids): """ Convert a list of canonical DOIs to a list of DOIs URLs. :param dois: List of canonical DOIs. :returns: A list of DOIs URLs. - >>> to_URL('1506.06690') + >>> to_url('1506.06690') 'http://arxiv.org/abs/1506.06690' - >>> to_URL('1506.06690v1') + >>> to_url('1506.06690v1') 'http://arxiv.org/abs/1506.06690v1' """ if isinstance(arxiv_ids, list): @@ -358,16 +361,10 @@ def to_canonical(urls): >>> to_canonical('aaa') is None True """ - try: - if isinstance(urls, list): - return [next(iter(extract_from_text(url))) for url in urls] - else: - return next(iter(extract_from_text(urls))) - except StopIteration: - return None + return tools.map_or_apply(extract_from_text, urls) -def from_DOI(doi): +def from_doi(doi): """ Get the arXiv eprint id for a given DOI. @@ -379,29 +376,29 @@ def from_DOI(doi): :param doi: The DOI of the resource to look for. :returns: The arXiv eprint id, or ``None`` if not found. - >>> from_DOI('10.1209/0295-5075/111/40005') + >>> from_doi('10.1209/0295-5075/111/40005') # Note: Test do not pass due to an arXiv API bug. '1506.06690' """ try: - r = requests.get("http://export.arxiv.org/api/query", - params={ - "search_query": "doi:%s" % (doi,), - "max_results": 1 - }) - r.raise_for_status() + request = requests.get("http://export.arxiv.org/api/query", + params={ + "search_query": "doi:%s" % (doi,), + "max_results": 1 + }) + request.raise_for_status() except RequestException: return None - e = xml.etree.ElementTree.fromstring(r.content) - for entry in e.iter("{http://www.w3.org/2005/Atom}entry"): - id = entry.find("{http://www.w3.org/2005/Atom}id").text - # id is an arXiv full URL. 
We only want the id which is the last URL - # component. - return id.split("/")[-1] + root = xml.etree.ElementTree.fromstring(request.content) + for entry in root.iter("{http://www.w3.org/2005/Atom}entry"): + arxiv_id = entry.find("{http://www.w3.org/2005/Atom}id").text + # arxiv_id is an arXiv full URL. We only want the id which is the last + # URL component. + return arxiv_id.split("/")[-1] return None -def to_DOI(arxiv_id): +def to_doi(arxiv_id): """ Get the associated DOI for a given arXiv eprint. @@ -413,23 +410,23 @@ def to_DOI(arxiv_id): :param eprint: The arXiv eprint id. :returns: The DOI if any, or ``None``. - >>> to_DOI('1506.06690v1') + >>> to_doi('1506.06690v1') '10.1209/0295-5075/111/40005' - >>> to_DOI('1506.06690') + >>> to_doi('1506.06690') '10.1209/0295-5075/111/40005' """ try: - r = requests.get("http://export.arxiv.org/api/query", - params={ - "id_list": arxiv_id, - "max_results": 1 - }) - r.raise_for_status() + request = requests.get("http://export.arxiv.org/api/query", + params={ + "id_list": arxiv_id, + "max_results": 1 + }) + request.raise_for_status() except RequestException: return None - e = xml.etree.ElementTree.fromstring(r.content) - for entry in e.iter("{http://www.w3.org/2005/Atom}entry"): + root = xml.etree.ElementTree.fromstring(request.content) + for entry in root.iter("{http://www.w3.org/2005/Atom}entry"): doi = entry.find("{http://arxiv.org/schemas/atom}doi") if doi is not None: return doi.text @@ -451,9 +448,9 @@ def get_sources(arxiv_id): ``None``. """ try: - r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id)) - r.raise_for_status() - file_object = io.BytesIO(r.content) + request = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id)) + request.raise_for_status() + file_object = io.BytesIO(request.content) return tarfile.open(fileobj=file_object) except (RequestException, AssertionError, tarfile.TarError): return None @@ -473,9 +470,9 @@ def get_bbl(arxiv_id): :returns: A list of the full text of the ``.bbl`` files (if any) \ or ``None``. 
""" - tf = get_sources(arxiv_id) - bbl_files = [i for i in tf.getmembers() if i.name.endswith(".bbl")] - bbl_files = [tf.extractfile(member).read().decode(tarfile.ENCODING) + tar_file = get_sources(arxiv_id) + bbl_files = [i for i in tar_file.getmembers() if i.name.endswith(".bbl")] + bbl_files = [tar_file.extractfile(member).read().decode(tarfile.ENCODING) for member in bbl_files] return bbl_files @@ -498,5 +495,5 @@ def get_citations(arxiv_id): bbl_files = get_bbl(arxiv_id) for bbl_file in bbl_files: # Fetch the cited DOIs for each of the bbl files - dois.update(bbl.get_cited_DOIs(bbl_file)) + dois.update(bbl.get_cited_dois(bbl_file)) return dois diff --git a/libbmc/repositories/hal.py b/libbmc/repositories/hal.py index baf3a97..5752dc0 100644 --- a/libbmc/repositories/hal.py +++ b/libbmc/repositories/hal.py @@ -33,7 +33,7 @@ def is_valid(hal_id): False """ match = REGEX.match(hal_id) - return ((match is not None) and (match.group(0) == hal_id)) + return (match is not None) and (match.group(0) == hal_id) def extract_from_text(text): diff --git a/libbmc/tools.py b/libbmc/tools.py index b278700..f35b654 100644 --- a/libbmc/tools.py +++ b/libbmc/tools.py @@ -9,9 +9,11 @@ from itertools import islice, chain # Huge URL regex taken from https://gist.github.com/gruber/8891611 URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?>> replaceAll("foo bar foo thing", {"foo": "oof", "bar": "rab"}) + >>> replace_all("foo bar foo thing", {"foo": "oof", "bar": "rab"}) 'oof rab oof thing' """ for i, j in replace_dict.items(): @@ -34,6 +36,24 @@ def replaceAll(text, replace_dict): return text +def map_or_apply(function, param): + """ + Map the function on ``param``, or apply it, depending whether ``param`` \ + is a list or an item. + + :param function: The function to apply. + :param param: The parameter to feed the function with (list or item). + :returns: The computed value or ``None``. + """ + try: + if isinstance(param, list): + return [next(iter(function(i))) for i in param] + else: + return next(iter(function(param))) + except StopIteration: + return None + + def clean_whitespaces(text): """ Remove multiple whitespaces from text. 
Also removes leading and trailing \ @@ -85,13 +105,13 @@ def batch(iterable, size): >>> [list(i) for i in batch([1, 2, 3, 4, 5], 2)] [[1, 2], [3, 4], [5]] """ - it = iter(iterable) + item = iter(iterable) while True: - bi = islice(it, size) - yield chain([next(bi)], bi) + batch_iterator = islice(item, size) + yield chain([next(batch_iterator)], batch_iterator) -def remove_URLs(text): +def remove_urls(text): """ Remove URLs from a given text (only removes http, https and naked domains \ URLs). @@ -99,16 +119,12 @@ def remove_URLs(text): :param text: The text to remove URLs from. :returns: The text without URLs. - >>> remove_URLs("foobar http://example.com https://example.com foobar") + >>> remove_urls("foobar http://example.com https://example.com foobar") 'foobar foobar' """ return clean_whitespaces(URL_REGEX.sub("", text)) -_slugify_strip_re = re.compile(r'[^\w\s-]') -_slugify_hyphenate_re = re.compile(r'[\s]+') - - def slugify(value): """ Normalizes string, converts to lowercase, removes non-alpha characters, @@ -127,5 +143,5 @@ def slugify(value): value = unicode_type(value) value = (unicodedata.normalize('NFKD', value). encode('ascii', 'ignore').decode('ascii')) - value = unicode_type(_slugify_strip_re.sub('', value).strip()) - return _slugify_hyphenate_re.sub('_', value) + value = unicode_type(_SLUGIFY_STRIP_RE.sub('', value).strip()) + return _SLUGIFY_HYPHENATE_RE.sub('_', value)
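
A note on the new `tools.map_or_apply` helper that `doi.to_canonical` and `arxiv.to_canonical` now delegate to: the snippet below is a minimal, self-contained sketch of its behaviour. The helper itself is copied from the `libbmc/tools.py` hunk above; the `extract_digits` function is only a toy stand-in for the real `extract_from_text` extractors.

```python
import re


def map_or_apply(function, param):
    """Map ``function`` over a list, or apply it to a single item.

    Copied from the ``libbmc.tools`` addition above; returns ``None`` when
    ``function`` yields nothing for an input.
    """
    try:
        if isinstance(param, list):
            return [next(iter(function(i))) for i in param]
        else:
            return next(iter(function(param)))
    except StopIteration:
        return None


def extract_digits(text):
    # Toy stand-in for ``extract_from_text``: return all runs of digits.
    return re.findall(r"\d+", text)


print(map_or_apply(extract_digits, "EPL 111, 40005"))            # '111'
print(map_or_apply(extract_digits, ["vol. 111", "page 40005"]))  # ['111', '40005']
print(map_or_apply(extract_digits, "no digits here"))            # None
```

This preserves the behaviour of the inline `try`/`except StopIteration` blocks it replaces in `doi.py` and `arxiv.py`, including returning `None` for the whole list if any single element yields no match.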
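
The `fetcher` refactor also changes the calling convention for direct connections: the mutable default `proxies=[None]` is gone, and an empty string now means "no proxy". A hypothetical caller would look like the sketch below; the URL is the one from the docstring, while the SOCKS proxy string is an illustrative placeholder (the exact accepted format is an assumption, not confirmed by this diff).

```python
from libbmc import fetcher

# Try a direct connection first (""), then fall back to a SOCKS proxy.
# The proxy address below is a placeholder, not a real endpoint.
data, content_type = fetcher.download(
    "http://arxiv.org/pdf/1312.4006.pdf",
    proxies=["", "socks5://127.0.0.1:9050"],
)

if data is None:
    # download() returns (None, None) once every proxy has failed.
    print("Download failed on every proxy")
elif content_type == "pdf":
    with open("1312.4006.pdf", "wb") as fh:
        fh.write(data)
```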
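
Finally, since this diff renames most public camelCase/uppercase helpers to snake_case, downstream code has to be updated. A hypothetical caller-side migration for a few of the renamed entry points (old names in the comments; the `.bbl` path is a placeholder, and most of these calls need network access):

```python
from libbmc import doi, isbn, tools
from libbmc.repositories import arxiv
from libbmc.citations import bbl

doi_url = doi.to_url("10.1209/0295-5075/111/40005")    # was doi.to_URL()
paper_doi = arxiv.to_doi("1506.06690")                 # was arxiv.to_DOI()
book_doi = isbn.to_doi("9783161484100")                # was isbn.to_DOI()
clean = tools.remove_urls("see http://example.com")    # was tools.remove_URLs()

with open("refs.bbl") as fh:                           # placeholder path
    cited = bbl.get_cited_dois(fh.read())              # was bbl.get_cited_DOIs()
```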