First commit
This commit is contained in:
commit
97eb5a3ae0
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
__pycache__
|
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
[submodule "libbmc/external/opendetex"]
|
||||
path = libbmc/external/opendetex
|
||||
url = https://github.com/Phyks/opendetex
|
||||
[submodule "libbmc/external/poppler"]
|
||||
path = libbmc/external/poppler
|
||||
url = git://git.freedesktop.org/git/poppler/poppler
|
0
libbmc/__init__.py
Normal file
0
libbmc/__init__.py
Normal file
113
libbmc/doi.py
Normal file
113
libbmc/doi.py
Normal file
@ -0,0 +1,113 @@
|
||||
"""
|
||||
This file contains all the DOI-related functions.
|
||||
"""
|
||||
import re
|
||||
import requests
|
||||
|
||||
from libbmc import tools
|
||||
|
||||
# DOI-matching pattern, borrowed from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
# The single capturing group holds the whole DOI, so ``findall`` yields
# plain strings.
regex = re.compile(
    r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
    flags=re.IGNORECASE,
)
# Template of the dx.doi.org redirection URL; ``{doi}`` receives the DOI.
dx_url = "http://dx.doi.org/{doi}"
|
||||
|
||||
|
||||
def is_valid(doi):
    """
    Tell whether a string is, in its entirety, a canonical DOI.

    :param doi: The candidate DOI string.
    :returns: ``True`` when the DOI pattern matches at the start of ``doi``
        and the matched text is the whole string, ``False`` otherwise.
    """
    found = regex.match(doi)
    if found is None:
        return False
    # A match on a mere prefix of the string does not count.
    return found.group(0) == doi
|
||||
|
||||
|
||||
def extract_from_text(text):
    """
    Collect the canonical DOIs appearing in a text.

    :param text: The text to scan for DOIs.
    :returns: A list of the DOIs found, passed through
        ``tools.remove_duplicates``.
    """
    candidates = regex.findall(text)
    return tools.remove_duplicates(candidates)
|
||||
|
||||
|
||||
def to_URL(dois):
    """
    Build the dx.doi.org URL(s) for one or several canonical DOIs.

    :param dois: Either a list of canonical DOIs or a single DOI.
    :returns: A list of URLs when given a list, a single URL otherwise.
    """
    # Anything that is not a list is formatted as a single DOI.
    if not isinstance(dois, list):
        return dx_url.format(doi=dois)

    urls = []
    for single_doi in dois:
        urls.append(dx_url.format(doi=single_doi))
    return urls
|
||||
|
||||
|
||||
def to_canonical(urls):
    """
    Extract the canonical DOIs contained in one or several DOI URLs.

    :param urls: Either a list of DOI URLs or a single DOI URL.
    :returns: For a single URL, the result of ``extract_from_text`` on it.
        For a list, a list with one such result per URL.
    """
    # Anything that is not a list is handled as a single URL.
    if not isinstance(urls, list):
        return extract_from_text(urls)

    extracted = []
    for single_url in urls:
        extracted.append(extract_from_text(single_url))
    return extracted
|
||||
|
||||
|
||||
def get_oa_version(doi):
    """
    Get an OA version for a given DOI.

    .. note::

        Uses beta.dissem.in API.

    :param doi: A canonical DOI. It is inserted as is in the API URL; a DOI
        given as a full URL is not truncated here.
    :returns: The URL of the OA version of the given DOI, or ``None`` when
        the HTTP status is not OK, the body is not valid JSON, the API
        status is not ``"ok"`` or the expected keys are missing.
    """
    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
    # Explicit checks rather than ``assert``: assertions are stripped when
    # Python runs with ``-O``, which would silently disable the validation.
    if r.status_code != requests.codes.ok:
        return None
    try:
        result = r.json()
        if result["status"] != "ok":
            return None
        return result["paper"]["pdf_url"]
    except (ValueError, KeyError):
        # ValueError: body is not JSON. KeyError: unexpected JSON layout.
        return None
|
||||
|
||||
|
||||
def get_linked_version(doi):
    """
    Find where the DOI resolver redirects for a given DOI.

    :param doi: A canonical DOI.
    :returns: The value of the ``location`` header of the response to a
        HEAD request on the DOI URL, or ``None`` if there is no such header.
    """
    resolver_url = to_URL(doi)
    response = requests.head(resolver_url)
    return response.headers.get("location")
|
||||
|
||||
|
||||
def get_bibtex(doi):
    """
    Fetch the BibTeX entry of a DOI through content negotiation.

    .. note::

        Adapted from https://gist.github.com/jrsmith3/5513926.

    :param doi: The canonical DOI to get BibTeX from.
    :returns: The response body as a string when the response content type
        is exactly ``application/x-bibtex``, ``None`` otherwise.
    """
    wanted_headers = {"accept": "application/x-bibtex"}
    response = requests.get(to_URL(doi), headers=wanted_headers)
    # Any other content type means the server did not honour the request.
    if response.headers.get("content-type") != "application/x-bibtex":
        return None
    return response.text
|
1
libbmc/external/opendetex
vendored
Submodule
1
libbmc/external/opendetex
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit b980e6764279df32acd0f693a163d2040f1166b7
|
1
libbmc/external/poppler
vendored
Submodule
1
libbmc/external/poppler
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit b3425dd3261679958cd56c0f71995c15d2124433
|
55
libbmc/isbn.py
Normal file
55
libbmc/isbn.py
Normal file
@ -0,0 +1,55 @@
|
||||
"""
|
||||
This file contains all the ISBN-related functions.
|
||||
"""
|
||||
import isbnlib
|
||||
|
||||
from libbmc import doi
|
||||
|
||||
|
||||
def is_valid(isbn):
    """
    Tell whether a string is a valid ISBN.

    :param isbn: The ISBN candidate to check.
    :returns: The boolean negation of ``isbnlib.notisbn(isbn)``.
    """
    rejected = isbnlib.notisbn(isbn)
    return not rejected
|
||||
|
||||
|
||||
def extract_from_text(text):
    """
    Collect the ISBNs appearing in a text.

    :param text: Some text.
    :returns: A list holding, for every ISBN-like string reported by
        ``isbnlib.get_isbnlike``, the result of
        ``isbnlib.get_canonical_isbn`` on it.
    """
    canonical_isbns = []
    for candidate in isbnlib.get_isbnlike(text):
        canonical_isbns.append(isbnlib.get_canonical_isbn(candidate))
    return canonical_isbns
|
||||
|
||||
|
||||
def get_bibtex(isbn):
    """
    Fetch a BibTeX string for an ISBN, going through its DOI.

    :param isbn: ISBN to fetch BibTeX entry for.
    :returns: Whatever ``doi.get_bibtex`` returns for the DOI obtained from
        ``to_doi(isbn)``.
    """
    resolved_doi = to_doi(isbn)
    return doi.get_bibtex(resolved_doi)
|
||||
|
||||
|
||||
def to_doi(isbn):
    """
    Derive a DOI from an ISBN.

    :param isbn: A valid ISBN string.
    :returns: The value returned by ``isbnlib.doi`` for this ISBN.
    """
    derived_doi = isbnlib.doi(isbn)
    return derived_doi
|
||||
|
||||
|
||||
def from_doi(doi):
    """
    Get the ISBN associated with a DOI. Not implemented yet (TODO).

    :param doi: A canonical DOI.
    :raises NotImplementedError: Always, as this is still a stub.
    """
    # Raise explicitly instead of ``assert(False)``: assertions are removed
    # under ``python -O`` and the stub would then silently return ``None``.
    raise NotImplementedError("isbn.from_doi is not implemented yet")
|
316
libbmc/repositories/arxiv.py
Normal file
316
libbmc/repositories/arxiv.py
Normal file
@ -0,0 +1,316 @@
|
||||
"""
|
||||
This file contains all the arXiv-related functions.
|
||||
"""
|
||||
import io
|
||||
import re
|
||||
import requests
|
||||
import tarfile
|
||||
import xml.etree.ElementTree
|
||||
|
||||
from libbmc import tools
|
||||
|
||||
|
||||
# New-style identifiers (the "from 2007" scheme): four digits, a dot, four
# or five digits and an optional version suffix such as ``v2``.
# Groups are non-capturing so that ``regex.findall`` yields plain strings.
arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(?:v\d+)?"

# Archive / subject-class prefixes accepted for old-style identifiers.
_arxiv_categories = [
    "astro-ph.GA",
    "astro-ph.CO",
    "astro-ph.EP",
    "astro-ph.HE",
    "astro-ph.IM",
    "astro-ph.SR",
    # The condensed matter archive is named "cond-mat" (not "cond-math").
    "cond-mat.dis-nn",
    "cond-mat.mtrl-sci",
    "cond-mat.mes-hall",
    "cond-mat.other",
    "cond-mat.quant-gas",
    "cond-mat.soft",
    "cond-mat.stat-mech",
    "cond-mat.str-el",
    "cond-mat.supr-con",
    "gr-qc",
    "hep-ex",
    "hep-lat",
    "hep-ph",
    "hep-th",
    "math-ph",
    "nlin.AO",
    "nlin.CG",
    "nlin.CD",
    "nlin.SI",
    "nlin.PS",
    "nucl-ex",
    "nucl-th",
    "physics.acc-ph",
    "physics.ao-ph",
    "physics.atom-ph",
    "physics.atm-clus",
    "physics.bio-ph",
    "physics.chem-ph",
    "physics.class-ph",
    "physics.comp-ph",
    "physics.data-an",
    "physics.flu-dyn",
    "physics.gen-ph",
    "physics.geo-ph",
    "physics.hist-ph",
    "physics.ins-det",
    "physics.med-ph",
    "physics.optics",
    "physics.ed-ph",
    "physics.soc-ph",
    "physics.plasm-ph",
    "physics.pop-ph",
    "physics.space-ph",
    "physics.quant-ph",
    "math.AG",
    "math.AT",
    "math.AP",
    "math.CT",
    "math.CA",
    "math.CO",
    "math.AC",
    "math.CV",
    "math.DG",
    "math.DS",
    "math.FA",
    "math.GM",
    "math.GN",
    "math.GT",
    "math.GR",
    "math.HO",
    "math.IT",
    "math.KT",
    "math.LO",
    "math.MP",
    "math.MG",
    "math.NT",
    "math.NA",
    "math.OA",
    "math.OC",
    "math.PR",
    "math.QA",
    "math.RT",
    "math.RA",
    "math.SP",
    "math.ST",
    "math.SG",
    "cs.AI",
    "cs.CL",
    "cs.CC",
    "cs.CE",
    "cs.CG",
    "cs.GT",
    "cs.CV",
    "cs.CY",
    "cs.CR",
    "cs.DS",
    "cs.DB",
    "cs.DL",
    "cs.DM",
    "cs.DC",
    "cs.ET",
    "cs.FL",
    "cs.GL",
    "cs.GR",
    "cs.AR",
    "cs.HC",
    "cs.IR",
    "cs.IT",
    "cs.LG",
    "cs.LO",
    "cs.MS",
    "cs.MA",
    "cs.MM",
    "cs.NI",
    "cs.NE",
    "cs.NA",
    "cs.OS",
    "cs.OH",
    "cs.PF",
    "cs.PL",
    "cs.RO",
    "cs.SI",
    "cs.SE",
    "cs.SD",
    "cs.SC",
    "cs.SY",
    "q-bio.BM",
    "q-bio.CB",
    "q-bio.GN",
    "q-bio.MN",
    "q-bio.NC",
    "q-bio.OT",
    "q-bio.PE",
    "q-bio.QM",
    "q-bio.SC",
    "q-bio.TO",
    "q-fin.CP",
    "q-fin.EC",
    "q-fin.GN",
    "q-fin.MF",
    "q-fin.PM",
    "q-fin.PR",
    "q-fin.RM",
    "q-fin.ST",
    "q-fin.TR",
    "stat.AP",
    "stat.CO",
    "stat.ML",
    "stat.ME",
    "stat.OT",
    "stat.TH",
]

# Old-style identifiers: one of the prefixes above, a slash, then digits.
# ``re.escape`` makes the "." of a prefix match a literal dot only.
arxiv_identifier_before_2007 = (
    r"(?:" +
    "|".join(re.escape(category) for category in _arxiv_categories) +
    r")/\d+"
)

# Matches either identifier scheme, case-insensitively.
regex = re.compile(
    "(?:" + arxiv_identifier_from_2007 + ")|(?:" +
    arxiv_identifier_before_2007 + ")",
    re.IGNORECASE)

# Base arXiv URL used as id sometimes
arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
# Eprint URL used to download sources
arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
|
||||
|
||||
|
||||
def is_valid(arxiv_id):
    """
    Tell whether a string is, in its entirety, an arXiv identifier.

    :param arxiv_id: The candidate arXiv ID.
    :returns: ``True`` when the arXiv pattern matches at the start of
        ``arxiv_id`` and the matched text is the whole string, ``False``
        otherwise.
    """
    found = regex.match(arxiv_id)
    if found is None:
        return False
    # A match on a mere prefix of the string does not count.
    return found.group(0) == arxiv_id
|
||||
|
||||
|
||||
def get_bibtex(arxiv_id):
    """
    Get a BibTeX entry for an arXiv eprint. Not implemented yet (TODO).

    :param arxiv_id: The arXiv ID of the eprint.
    :raises NotImplementedError: Always, as this is still a stub.
    """
    # Raise explicitly instead of ``assert(False)``: assertions are removed
    # under ``python -O`` and the stub would then silently return ``None``.
    raise NotImplementedError("arxiv.get_bibtex is not implemented yet")
|
||||
|
||||
|
||||
def extract_from_text(text):
    """
    Extract arXiv IDs from a text.

    :param text: The text to extract arXiv IDs from.
    :returns: A list of the matching arXiv IDs (strings), passed through
        ``tools.remove_duplicates``.
    """
    # ``findall`` returns tuples of groups as soon as the pattern has more
    # than one capturing group; take the whole match so that the result is
    # a list of identifier strings whatever the grouping of the pattern.
    found_ids = [match.group(0) for match in regex.finditer(text)]
    return tools.remove_duplicates(found_ids)
|
||||
|
||||
|
||||
def to_URL(arxiv_ids):
    """
    Build the arXiv abstract URL(s) for one or several arXiv IDs.

    :param arxiv_ids: Either a list of arXiv IDs or a single arXiv ID.
    :returns: A list of URLs when given a list, a single URL otherwise.
    """
    # Anything that is not a list is formatted as a single identifier.
    if not isinstance(arxiv_ids, list):
        return arxiv_url.format(arxiv_id=arxiv_ids)

    urls = []
    for single_id in arxiv_ids:
        urls.append(arxiv_url.format(arxiv_id=single_id))
    return urls
|
||||
|
||||
|
||||
def to_canonical(urls):
    """
    Extract the arXiv IDs contained in one or several arXiv URLs.

    :param urls: Either a list of arXiv URLs or a single arXiv URL.
    :returns: For a single URL, the result of ``extract_from_text`` on it.
        For a list, a list with one such result per URL.
    """
    # Anything that is not a list is handled as a single URL.
    if not isinstance(urls, list):
        return extract_from_text(urls)

    extracted = []
    for single_url in urls:
        extracted.append(extract_from_text(single_url))
    return extracted
|
||||
|
||||
|
||||
def from_doi(doi):
    """
    Get the arXiv eprint id for a given DOI.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param doi: The DOI of the resource to look for.
    :returns: The arXiv eprint id, or ``None`` if not found.
    """
    r = requests.get("http://export.arxiv.org/api/query",
                     params={
                         "search_query": "doi:%s" % (doi,),
                         "max_results": 1
                     })
    e = xml.etree.ElementTree.fromstring(r.content)
    for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
        id_node = entry.find("{http://www.w3.org/2005/Atom}id")
        if id_node is None or not id_node.text:
            # Entry without a usable <id>: look at the next one, if any.
            continue
        entry_url = id_node.text
        # The id is a full arXiv URL. Keep everything after "/abs/" so that
        # old-style identifiers such as "hep-th/9901001v1" keep their
        # archive prefix instead of being cut at the last "/".
        _, separator, arxiv_id = entry_url.partition("/abs/")
        if separator:
            return arxiv_id
        # No "/abs/" in the URL: fall back to the last URL component.
        return entry_url.split("/")[-1]
    return None
|
||||
|
||||
|
||||
def to_doi(arxiv_id):
    """
    Look up the DOI that arXiv associates with an eprint.

    .. note::

        Uses arXiv API. Will not return anything if arXiv is not aware of the
        associated DOI.

    :param arxiv_id: The arXiv eprint id.
    :returns: The DOI if any, or ``None``.
    """
    query = {
        "id_list": arxiv_id,
        "max_results": 1
    }
    response = requests.get("http://export.arxiv.org/api/query", params=query)
    feed = xml.etree.ElementTree.fromstring(response.content)
    for entry in feed.iter("{http://www.w3.org/2005/Atom}entry"):
        doi_node = entry.find("{http://arxiv.org/schemas/atom}doi")
        if doi_node is None:
            # This entry carries no DOI element; try the next one.
            continue
        return doi_node.text
    return None
|
||||
|
||||
|
||||
def get_sources(arxiv_id):
    """
    Download sources on arXiv for a given preprint.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in
        a canonical form.
    :returns: A ``TarFile`` object of the sources of the arXiv preprint, or
        ``None`` when the HTTP status is not OK or the downloaded content
        cannot be opened as a tar archive.
    """
    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with ``-O``.
    if r.status_code != requests.codes.ok:
        return None
    try:
        # The archive is opened from memory; nothing is written to disk.
        return tarfile.open(fileobj=io.BytesIO(r.content))
    except tarfile.TarError:
        return None
|
||||
|
||||
|
||||
def get_bbl(arxiv_id):
    """
    Get the .bbl files (if any) of a given preprint.

    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in
        a canonical form.
    :returns: A list of the full text of the ``.bbl`` files (possibly
        empty), or ``None`` if the sources could not be fetched.
    """
    tf = get_sources(arxiv_id)
    if tf is None:
        # No usable source archive: honour the documented ``None`` result
        # instead of failing with an AttributeError.
        return None
    bbl_files = []
    # Close the archive once every wanted member has been read.
    with tf:
        for member in tf.getmembers():
            if not member.name.endswith(".bbl"):
                continue
            extracted = tf.extractfile(member)
            if extracted is None:
                # ``extractfile`` returns None for members that are neither
                # regular files nor links (e.g. directories).
                continue
            bbl_files.append(extracted.read().decode(tarfile.ENCODING))
    return bbl_files
|
||||
|
||||
|
||||
def get_citations(arxiv_id):
    """
    Get the citations of an arXiv eprint. Not implemented yet (TODO).

    :param arxiv_id: The arXiv ID of the eprint.
    :raises NotImplementedError: Always, as this is still a stub.
    """
    # Raise explicitly instead of ``assert(False)``: assertions are removed
    # under ``python -O`` and the stub would then silently return ``None``.
    raise NotImplementedError("arxiv.get_citations is not implemented yet")
|
37
libbmc/tools.py
Normal file
37
libbmc/tools.py
Normal file
@ -0,0 +1,37 @@
|
||||
"""
|
||||
This file contains various utility functions.
|
||||
"""
|
||||
|
||||
|
||||
def replaceAll(text, replace_dict):
    """
    Apply several string substitutions to a text, one after the other.

    :param text: Text to replace in.
    :param replace_dict: Dictionary mapping each string to replace to its
        substitution. Substitutions are applied in the iteration order of
        the dictionary, each one on the result of the previous one.
    :returns: Text after replacements.
    """
    result = text
    for needle in replace_dict:
        result = result.replace(needle, replace_dict[needle])
    return result
|
||||
|
||||
|
||||
def clean_whitespaces(text):
    """
    Collapse every run of whitespace in a text into a single space.

    :param text: Text to remove multiple whitespaces from.
    :returns: The text without leading or trailing whitespace and with each
        inner run of whitespace replaced by one space.
    """
    trimmed = text.strip()
    words = trimmed.split()
    return ' '.join(words)
|
||||
|
||||
|
||||
def remove_duplicates(some_list):
    """
    Remove the duplicates from a list, keeping the first occurrences.

    :param some_list: List to remove duplicates from. Its items must be
        hashable.
    :returns: A new list without duplicates, in order of first appearance.
    """
    # Going through ``set`` alone would return the items in an arbitrary
    # order; track what has been seen to keep the result deterministic.
    seen = set()
    unique_items = []
    for item in some_list:
        if item not in seen:
            seen.add(item)
            unique_items.append(item)
    return unique_items
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
isbnlib==3.5.7
|
||||
requests==2.9.1
|
Loading…
x
Reference in New Issue
Block a user