First commit

This commit is contained in:
Lucas Verney 2015-12-27 19:35:55 +01:00
commit 97eb5a3ae0
10 changed files with 532 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

6
.gitmodules vendored Normal file
View File

@ -0,0 +1,6 @@
[submodule "libbmc/external/opendetex"]
path = libbmc/external/opendetex
url = https://github.com/Phyks/opendetex
[submodule "libbmc/external/poppler"]
path = libbmc/external/poppler
url = git://git.freedesktop.org/git/poppler/poppler

0
libbmc/__init__.py Normal file
View File

113
libbmc/doi.py Normal file
View File

@ -0,0 +1,113 @@
"""
This file contains all the DOI-related functions.
"""
import re
import requests
from libbmc import tools
# Pattern matching a canonical DOI, taken from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
# The single capturing group spans the whole DOI, so ``findall`` yields the
# DOIs themselves.
regex = re.compile(
    r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
    flags=re.IGNORECASE
)

# URL template of the dx.doi.org resolver; ``{doi}`` is replaced by a DOI.
dx_url = "http://dx.doi.org/{doi}"
def is_valid(doi):
    """
    Tell whether a string is exactly one canonical DOI.

    :param doi: The DOI to be checked.
    :returns: ``True`` if the DOI pattern matches at the start of ``doi`` \
            and that match covers the whole string, ``False`` otherwise.
    """
    found = regex.match(doi)
    if found is None:
        return False
    # A match on a mere prefix of the string does not make it a valid DOI.
    return found.group(0) == doi
def extract_from_text(text):
    """
    Find every canonical DOI appearing in a text.

    :param text: The text to extract DOIs from.
    :returns: A list of the DOIs found, with duplicates removed.
    """
    found_dois = regex.findall(text)
    return tools.remove_duplicates(found_dois)
def to_URL(dois):
    """
    Build the dx.doi.org URL(s) for one or several canonical DOIs.

    :param dois: A canonical DOI, or a list of canonical DOIs.
    :returns: A list of URLs if a list was given, else a single URL.
    """
    if not isinstance(dois, list):
        # Anything that is not a list is treated as a single DOI.
        return dx_url.format(doi=dois)
    urls = []
    for single_doi in dois:
        urls.append(dx_url.format(doi=single_doi))
    return urls
def to_canonical(urls):
    """
    Extract the canonical DOIs contained in one or several DOI URLs.

    :param urls: A DOI URL, or a list of DOI URLs.
    :returns: For a single URL, the list of DOIs extracted from it. For a \
            list of URLs, a list holding one such list per URL.
    """
    if not isinstance(urls, list):
        return extract_from_text(urls)
    return list(map(extract_from_text, urls))
def get_oa_version(doi):
    """
    Get an OA version for a given DOI.

    .. note::

        Uses beta.dissem.in API.

    :param doi: A canonical DOI.
    :returns: The URL of the OA version of the given DOI, or ``None`` if \
            the request fails, the response is not valid JSON, or it does \
            not have the expected ``status`` / ``paper`` / ``pdf_url`` \
            fields.
    """
    # NOTE(review): the DOI is interpolated as-is; a DOI passed as a full
    # URL is not truncated to its canonical form here.
    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
    # Explicit checks rather than ``assert``: assertions are stripped when
    # Python runs with ``-O``, which would silently disable these guards.
    if r.status_code != requests.codes.ok:
        return None
    try:
        result = r.json()
        if result["status"] != "ok":
            return None
        return result["paper"]["pdf_url"]
    except (ValueError, KeyError):
        # Body is not JSON, or an expected key is missing.
        return None
def get_linked_version(doi):
    """
    Get the original link behind the DOI.

    :param doi: A canonical DOI.
    :returns: The value of the ``location`` header sent back by the DOI \
            resolver, or ``None`` if there is no such header.
    """
    resolver_url = to_URL(doi)
    # A HEAD request is enough: only the response headers are inspected.
    response = requests.head(resolver_url)
    return response.headers.get("location")
def get_bibtex(doi):
    """
    Get a BibTeX entry for a given DOI.

    .. note::

        Adapted from https://gist.github.com/jrsmith3/5513926.

    :param doi: The canonical DOI to get BibTeX from.
    :returns: A BibTeX string or ``None``.
    """
    r = requests.get(to_URL(doi),
                     headers={"accept": "application/x-bibtex"})
    # The Content-Type header may carry parameters (for instance
    # ``; charset=utf-8``) and media types are case-insensitive, so only
    # the normalised media type itself is compared.
    content_type = r.headers.get("content-type") or ""
    media_type = content_type.split(";", 1)[0].strip().lower()
    if media_type == "application/x-bibtex":
        return r.text
    return None

1
libbmc/external/opendetex vendored Submodule

@ -0,0 +1 @@
Subproject commit b980e6764279df32acd0f693a163d2040f1166b7

1
libbmc/external/poppler vendored Submodule

@ -0,0 +1 @@
Subproject commit b3425dd3261679958cd56c0f71995c15d2124433

55
libbmc/isbn.py Normal file
View File

@ -0,0 +1,55 @@
"""
This file contains all the ISBN-related functions.
"""
import isbnlib
from libbmc import doi
def is_valid(isbn):
    """
    Tell whether a given string is a valid ISBN.

    :param isbn: The ISBN to be checked.
    :returns: Boolean, the negation of what ``isbnlib.notisbn`` reports \
            for this string.
    """
    rejected = isbnlib.notisbn(isbn)
    return not rejected
def extract_from_text(text):
    """
    Extract ISBNs from a text.

    :param text: Some text.
    :returns: A list with the result of ``isbnlib.get_canonical_isbn`` for \
            every ISBN-like string that ``isbnlib.get_isbnlike`` finds in \
            the text.
    """
    canonical_isbns = []
    for candidate in isbnlib.get_isbnlike(text):
        canonical_isbns.append(isbnlib.get_canonical_isbn(candidate))
    return canonical_isbns
def get_bibtex(isbn):
    """
    Get a BibTeX string for the given ISBN.

    The ISBN is first converted to a DOI with ``to_doi``, and the BibTeX
    entry is then fetched through ``doi.get_bibtex``.

    :param isbn: ISBN to fetch BibTeX entry for.
    :returns: Whatever ``doi.get_bibtex`` returns for the matching DOI.
    """
    matching_doi = to_doi(isbn)
    return doi.get_bibtex(matching_doi)
def to_doi(isbn):
    """
    Get a DOI for a given ISBN.

    :param isbn: A valid ISBN string.
    :returns: The value returned by ``isbnlib.doi`` for this ISBN.
    """
    isbn_doi = isbnlib.doi(isbn)
    return isbn_doi
def from_doi(doi):
    """
    TODO: not implemented yet.

    Presumably meant to be the counterpart of ``to_doi`` (DOI to ISBN) --
    to be confirmed when it gets implemented.

    :param doi: A DOI.
    :raises NotImplementedError: Always.
    """
    # An explicit raise instead of ``assert(False)``: assertions are
    # stripped under ``python -O``, which would make this stub silently
    # return ``None``.
    raise NotImplementedError("from_doi is not implemented yet.")

View File

@ -0,0 +1,316 @@
"""
This file contains all the arXiv-related functions.
"""
import io
import re
import requests
import tarfile
import xml.etree.ElementTree
from libbmc import tools
# New-style identifiers: ``YYMM.NNNN`` or ``YYMM.NNNNN``, optionally followed
# by a version suffix (``v2``).
arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
# Old-style identifiers: ``<category>/<digits>``, optionally followed by a
# version suffix. Category names are passed through ``re.escape`` so that
# the dot in e.g. ``math.GT`` only matches a literal dot.
# NOTE(review): bare archive prefixes without a subject class (such as
# ``quant-ph`` or ``astro-ph``) are not in this list -- confirm whether they
# should be accepted as well.
arxiv_identifier_before_2007 = r"(" + ("|".join(
    re.escape(category) for category in [
        "astro-ph.GA",
        "astro-ph.CO",
        "astro-ph.EP",
        "astro-ph.HE",
        "astro-ph.IM",
        "astro-ph.SR",
        "cond-mat.dis-nn",
        "cond-mat.mtrl-sci",
        "cond-mat.mes-hall",
        "cond-mat.other",
        "cond-mat.quant-gas",
        "cond-mat.soft",
        "cond-mat.stat-mech",
        "cond-mat.str-el",
        "cond-mat.supr-con",
        "gr-qc",
        "hep-ex",
        "hep-lat",
        "hep-ph",
        "hep-th",
        "math-ph",
        "nlin.AO",
        "nlin.CG",
        "nlin.CD",
        "nlin.SI",
        "nlin.PS",
        "nucl-ex",
        "nucl-th",
        "physics.acc-ph",
        "physics.ao-ph",
        "physics.atom-ph",
        "physics.atm-clus",
        "physics.bio-ph",
        "physics.chem-ph",
        "physics.class-ph",
        "physics.comp-ph",
        "physics.data-an",
        "physics.flu-dyn",
        "physics.gen-ph",
        "physics.geo-ph",
        "physics.hist-ph",
        "physics.ins-det",
        "physics.med-ph",
        "physics.optics",
        "physics.ed-ph",
        "physics.soc-ph",
        "physics.plasm-ph",
        "physics.pop-ph",
        "physics.space-ph",
        "physics.quant-ph",
        "math.AG",
        "math.AT",
        "math.AP",
        "math.CT",
        "math.CA",
        "math.CO",
        "math.AC",
        "math.CV",
        "math.DG",
        "math.DS",
        "math.FA",
        "math.GM",
        "math.GN",
        "math.GT",
        "math.GR",
        "math.HO",
        "math.IT",
        "math.KT",
        "math.LO",
        "math.MP",
        "math.MG",
        "math.NT",
        "math.NA",
        "math.OA",
        "math.OC",
        "math.PR",
        "math.QA",
        "math.RT",
        "math.RA",
        "math.SP",
        "math.ST",
        "math.SG",
        "cs.AI",
        "cs.CL",
        "cs.CC",
        "cs.CE",
        "cs.CG",
        "cs.GT",
        "cs.CV",
        "cs.CY",
        "cs.CR",
        "cs.DS",
        "cs.DB",
        "cs.DL",
        "cs.DM",
        "cs.DC",
        "cs.ET",
        "cs.FL",
        "cs.GL",
        "cs.GR",
        "cs.AR",
        "cs.HC",
        "cs.IR",
        "cs.IT",
        "cs.LG",
        "cs.LO",
        "cs.MS",
        "cs.MA",
        "cs.MM",
        "cs.NI",
        "cs.NE",
        "cs.NA",
        "cs.OS",
        "cs.OH",
        "cs.PF",
        "cs.PL",
        "cs.RO",
        "cs.SI",
        "cs.SE",
        "cs.SD",
        "cs.SC",
        "cs.SY",
        "q-bio.BM",
        "q-bio.CB",
        "q-bio.GN",
        "q-bio.MN",
        "q-bio.NC",
        "q-bio.OT",
        "q-bio.PE",
        "q-bio.QM",
        "q-bio.SC",
        "q-bio.TO",
        "q-fin.CP",
        "q-fin.EC",
        "q-fin.GN",
        "q-fin.MF",
        "q-fin.PM",
        "q-fin.PR",
        "q-fin.RM",
        "q-fin.ST",
        "q-fin.TR",
        "stat.AP",
        "stat.CO",
        "stat.ML",
        "stat.ME",
        "stat.OT",
        "stat.TH"])) + r")/\d+(?:v\d+)?"
# Matches either identifier scheme. The pattern contains several capturing
# groups, so use ``match.group(0)`` to get the whole identifier.
regex = re.compile(
    "(" + arxiv_identifier_from_2007 + ")|(" +
    arxiv_identifier_before_2007 + ")",
    re.IGNORECASE)
# Base arXiv URL used as id sometimes
arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
# Eprint URL used to download sources
arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
def is_valid(arxiv_id):
    """
    Tell whether a string is exactly one arXiv identifier.

    :param arxiv_id: The arXiv ID to be checked.
    :returns: ``True`` if the arXiv pattern matches at the start of \
            ``arxiv_id`` and that match covers the whole string, ``False`` \
            otherwise.
    """
    found = regex.match(arxiv_id)
    if found is None:
        return False
    # A match on a mere prefix of the string does not make it a valid id.
    return found.group(0) == arxiv_id
def get_bibtex(arxiv_id):
    """
    TODO: not implemented yet.

    :param arxiv_id: An arXiv identifier.
    :raises NotImplementedError: Always.
    """
    # An explicit raise instead of ``assert(False)``: assertions are
    # stripped under ``python -O``, which would make this stub silently
    # return ``None``.
    raise NotImplementedError("get_bibtex is not implemented yet.")
def extract_from_text(text):
    """
    Extract arXiv IDs from a text.

    :param text: The text to extract arXiv IDs from.
    :returns: A list of matching arXiv IDs (strings), without duplicates.
    """
    # ``regex`` has several capturing groups, so ``findall`` would return
    # tuples of groups rather than the identifiers. Taking ``group(0)`` of
    # each match yields the full matched identifier.
    arxiv_ids = [match.group(0) for match in regex.finditer(text)]
    return tools.remove_duplicates(arxiv_ids)
def to_URL(arxiv_ids):
    """
    Build the arXiv abstract URL(s) for one or several arXiv IDs.

    :param arxiv_ids: An arXiv ID, or a list of arXiv IDs.
    :returns: A list of URLs if a list was given, else a single URL.
    """
    if not isinstance(arxiv_ids, list):
        # Anything that is not a list is treated as a single identifier.
        return arxiv_url.format(arxiv_id=arxiv_ids)
    urls = []
    for single_id in arxiv_ids:
        urls.append(arxiv_url.format(arxiv_id=single_id))
    return urls
def to_canonical(urls):
    """
    Extract the arXiv IDs contained in one or several arXiv URLs.

    :param urls: An arXiv URL, or a list of arXiv URLs.
    :returns: For a single URL, the result of ``extract_from_text`` on it. \
            For a list of URLs, a list holding one such result per URL.
    """
    if not isinstance(urls, list):
        return extract_from_text(urls)
    return list(map(extract_from_text, urls))
def from_doi(doi):
    """
    Get the arXiv eprint id for a given DOI.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found.
    """
    r = requests.get("http://export.arxiv.org/api/query",
                     params={
                         "search_query": "doi:%s" % (doi,),
                         "max_results": 1
                     })
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        entry_url = entry.find("{http://www.w3.org/2005/Atom}id").text
        # The Atom id is a full arXiv URL (``http://arxiv.org/abs/<id>``).
        # Keep everything after ``/abs/`` instead of only the last path
        # component, so that old-style identifiers containing a slash
        # (``hep-ex/0307015v1``) keep their archive prefix.
        return entry_url.split("/abs/", 1)[-1]
    return None
def to_doi(arxiv_id):
    """
    Look up the DOI that arXiv associates with a given eprint.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param arxiv_id: The arXiv eprint id.
    :returns: The DOI if any, or ``None``.
    """
    query = {
        "id_list": arxiv_id,
        "max_results": 1
    }
    response = requests.get("http://export.arxiv.org/api/query",
                            params=query)
    feed = xml.etree.ElementTree.fromstring(response.content)
    doi_nodes = (
        entry.find("{http://arxiv.org/schemas/atom}doi")
        for entry in feed.iter("{http://www.w3.org/2005/Atom}entry")
    )
    # Text of the first entry that carries a DOI element, if there is one.
    return next((node.text for node in doi_nodes if node is not None), None)
def get_sources(arxiv_id):
    """
    Download sources on arXiv for a given preprint.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
            in a canonical form.
    :returns: A ``TarFile`` object of the sources of the arXiv preprint, or \
            ``None`` if the download fails or the payload cannot be opened \
            as a tar archive.
    """
    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with ``-O``.
    if r.status_code != requests.codes.ok:
        return None
    try:
        # The archive is opened from memory; nothing is written to disk.
        return tarfile.open(fileobj=io.BytesIO(r.content))
    except tarfile.TarError:
        return None
def get_bbl(arxiv_id):
    """
    Get the .bbl files (if any) of a given preprint.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) \
            in a canonical form.
    :returns: A list of the full text of the ``.bbl`` files (possibly \
            empty), or ``None`` if the sources could not be fetched.
    """
    tf = get_sources(arxiv_id)
    if tf is None:
        # ``get_sources`` returns ``None`` on failure; propagate it instead
        # of crashing on ``None.getmembers()``.
        return None
    bbl_members = [member for member in tf.getmembers()
                   if member.name.endswith(".bbl")]
    # NOTE(review): contents are decoded with ``tarfile.ENCODING``, as in
    # the original code -- confirm this is the intended text encoding.
    return [tf.extractfile(member).read().decode(tarfile.ENCODING)
            for member in bbl_members]
def get_citations(arxiv_id):
    """
    TODO: not implemented yet.

    :param arxiv_id: An arXiv identifier.
    :raises NotImplementedError: Always.
    """
    # An explicit raise instead of ``assert(False)``: assertions are
    # stripped under ``python -O``, which would make this stub silently
    # return ``None``.
    raise NotImplementedError("get_citations is not implemented yet.")

37
libbmc/tools.py Normal file
View File

@ -0,0 +1,37 @@
"""
This file contains various utility functions.
"""
def replaceAll(text, replace_dict):
    """
    Apply a set of string substitutions to a text.

    The substitutions are applied one after the other, in the iteration
    order of the dictionary, each one working on the result of the previous
    one.

    :param text: Text to replace in.
    :param replace_dict: Dictionary mapping strings to replace with their \
            substitution.
    :returns: Text after replacements.
    """
    result = text
    for old, new in replace_dict.items():
        result = result.replace(old, new)
    return result
def clean_whitespaces(text):
    """
    Collapse every run of whitespace in a text into a single space.

    Leading and trailing whitespace is dropped.

    :param text: Text to remove multiple whitespaces from.
    :returns: A cleaned text.
    """
    words = text.strip().split()
    return " ".join(words)
def remove_duplicates(some_list):
    """
    Remove the duplicates from a list, keeping first-occurrence order.

    :param some_list: List to remove duplicates from. Its items must be \
            hashable.
    :returns: A new list holding each distinct item once, in the order in \
            which it first appears in ``some_list``.
    """
    # A plain ``list(set(...))`` would return the items in arbitrary order;
    # tracking already-seen items keeps the result deterministic.
    seen = set()
    unique_items = []
    for item in some_list:
        if item not in seen:
            seen.add(item)
            unique_items.append(item)
    return unique_items

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
isbnlib==3.5.7
requests==2.9.1