Reimport bbl citations parsing and make some minor fixes
parent 97eb5a3ae0
commit d8b74ae356

libbmc/citations/bbl.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
This file contains all the functions to deal with .bbl files.
"""
import os
import re
import requests
import subprocess

from requests.exceptions import RequestException

from libbmc import doi
from libbmc import tools
from libbmc.repositories import arxiv


# Regex to match bibitems
BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
# Regex to match end of bibliography
ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")


# CrossRef API URL
CROSSREF_LINKS_API_URL = "http://search.crossref.org/links"
CROSSREF_MAX_BATCH_SIZE = 10


def bibitem_as_plaintext(bibitem):
    """
    Return a plaintext representation of the bibitem from the ``.bbl`` file.

    .. note::

        This plaintext representation can be super ugly, contain URLs and so \
                on.

    :param bibitem: The text content of the bibitem.
    :returns: A cleaned plaintext citation from the bibitem.
    """
    try:
        output = subprocess.check_output(["delatex",
                                          "-s"],
                                         input=bibitem.encode("utf-8"))
    except FileNotFoundError:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        output = subprocess.check_output(["%s/../external/opendetex/delatex" %
                                          (script_dir,),
                                          "-s"],
                                         input=bibitem.encode("utf-8"))
    output = output.decode("utf-8")
    output = tools.clean_whitespaces(output)
    return output


def get_plaintext_citations(bbl):
    """
    Parse a ``*.bbl`` file to get a clean list of plaintext citations.

    :param bbl: Either the path to the .bbl file or the content of a ``.bbl`` \
            file.
    :returns: A list of cleaned plaintext citations.
    """
    # Handle path or content
    if os.path.isfile(bbl):
        with open(bbl, 'r') as fh:
            bbl_content = fh.read()
    else:
        bbl_content = bbl
    # Get a list of bibitems, taking the first item out as it is *before* the
    # first \bibitem
    bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
    # Delete the text after the \end{thebibliography}
    bibitems = [ENDTHEBIBLIOGRAPHY_REGEX.sub("", i).strip() for i in bibitems]
    # Clean every bibitem to have plaintext
    cleaned_bbl = [bibitem_as_plaintext(bibitem) for bibitem in bibitems]
    return cleaned_bbl


def get_cited_DOIs(bbl):
    """
    Get the DOIs of the papers cited in this .bbl file.

    :param bbl: Either the path to a .bbl file or the content of a .bbl file.

    :returns: A dict of cleaned plaintext citations and their associated DOI.
    """
    dois = {}
    crossref_queue = []
    # Get the plaintext citations from the bbl file
    plaintext_citations = get_plaintext_citations(bbl)
    # Try to get the DOI directly from the citation
    for citation in plaintext_citations[:]:
        # Some citations already contain a DOI so try to match it directly
        matched_DOIs = doi.extract_from_text(citation)
        if matched_DOIs:
            # Add the DOI and go on
            dois[citation] = matched_DOIs[0]
            continue
        # Same thing for arXiv id
        matched_arXiv = arxiv.extract_from_text(citation)
        if matched_arXiv:
            # Add the associated DOI and go on
            dois[citation] = arxiv.to_DOI(matched_arXiv[0])
            continue
        # If no match was found, queue the citation for the CrossRef step.
        # URLs are stripped first, as plaintext citations may contain URLs
        # and they confuse the CrossRef API.
        crossref_queue.append(tools.remove_URLs(citation))
    # Query CrossRef by batches of papers, to avoid timeouts
    for batch in tools.batch(crossref_queue, CROSSREF_MAX_BATCH_SIZE):
        # Consume the batch generator so it can be serialized and reused
        batch = list(batch)
        try:
            # Fetch results from CrossRef
            r = requests.post(CROSSREF_LINKS_API_URL, json=batch)
            for result in r.json()["results"]:
                # Try to get a DOI
                try:
                    dois[result["text"]] = result["doi"]
                except KeyError:
                    # Or set it to None
                    dois[result["text"]] = None
        except (RequestException, ValueError, KeyError):
            # If an exception occurred, set all the DOIs to None for the
            # current batch
            for i in batch:
                dois[i] = None
    return dois
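As a quick illustration of the new module, here is a minimal usage sketch (paper.bbl is a hypothetical file; it assumes the delatex binary is reachable as in bibitem_as_plaintext and that the CrossRef endpoint is accessible):

from libbmc.citations import bbl

# Extract the raw plaintext citations from the .bbl file
citations = bbl.get_plaintext_citations("paper.bbl")

# Resolve each citation to a DOI (or None) via direct matching,
# arXiv lookup and the CrossRef links API
cited_dois = bbl.get_cited_DOIs("paper.bbl")
for citation, doi_id in cited_dois.items():
    print(citation, "->", doi_id)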
@@ -4,14 +4,16 @@ This file contains all the DOI-related functions.
 import re
 import requests
+
+from requests.exceptions import RequestException

 from libbmc import tools

 # Taken from
 # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page/10324802#10324802
-regex = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
+REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b",
                    re.IGNORECASE)
 # Base dx.doi.org URL for redirections
-dx_url = "http://dx.doi.org/{doi}"
+DX_URL = "http://dx.doi.org/{doi}"


 def is_valid(doi):
@@ -21,7 +23,7 @@ def is_valid(doi):
     :param doi: The DOI to be checked.
     :returns: Boolean indicating whether the DOI is valid or not.
     """
-    match = regex.match(doi)
+    match = REGEX.match(doi)
     return ((match is not None) and (match.group(0) == doi))

@@ -32,7 +34,7 @@ def extract_from_text(text):
     :param text: The text to extract DOIs from.
     :returns: A list of found DOIs.
     """
-    return tools.remove_duplicates(regex.findall(text))
+    return tools.remove_duplicates(REGEX.findall(text))


 def to_URL(dois):
@@ -43,9 +45,9 @@ def to_URL(dois):
     :returns: A list of DOIs URLs.
     """
     if isinstance(dois, list):
-        return [dx_url.format(doi=doi) for doi in dois]
+        return [DX_URL.format(doi=doi) for doi in dois]
     else:
-        return dx_url.format(doi=dois)
+        return DX_URL.format(doi=dois)


 def to_canonical(urls):
@@ -73,13 +75,13 @@ def get_oa_version(doi):
     :returns: The URL of the OA version of the given DOI, or ``None``.
     """
     # If DOI is a link, truncate it
-    r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
     try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get("http://beta.dissem.in/api/%s" % (doi,))
+        r.raise_for_status()
         result = r.json()
         assert(result["status"] == "ok")
         return result["paper"]["pdf_url"]
-    except (AssertionError, ValueError, KeyError):
+    except (AssertionError, ValueError, KeyError, RequestException):
         return None

@@ -90,8 +92,11 @@ def get_linked_version(doi):
     :param doi: A canonical DOI.
     :returns: The canonical URL behind the DOI, or ``None``.
     """
-    r = requests.head(to_URL(doi))
-    return r.headers.get("location")
+    try:
+        r = requests.head(to_URL(doi))
+        return r.headers.get("location")
+    except RequestException:
+        return None


 def get_bibtex(doi):
@@ -105,9 +110,11 @@ def get_bibtex(doi):
     :param doi: The canonical DOI to get BibTeX from.
     :returns: A BibTeX string or ``None``.
     """
-    r = requests.get(to_URL(doi),
-                     headers={"accept": "application/x-bibtex"})
-    if r.headers.get("content-type") == "application/x-bibtex":
+    try:
+        r = requests.get(to_URL(doi),
+                         headers={"accept": "application/x-bibtex"})
+        r.raise_for_status()
+        assert(r.headers.get("content-type") == "application/x-bibtex")
         return r.text
-    else:
+    except (RequestException, AssertionError):
         return None
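For reference, a small sketch of the renamed doi helpers (assumes network access; the DOI below is only an example value):

from libbmc import doi

print(doi.is_valid("10.1103/PhysRevLett.116.061102"))  # True
print(doi.to_URL("10.1103/PhysRevLett.116.061102"))
# -> http://dx.doi.org/10.1103/PhysRevLett.116.061102
print(doi.get_bibtex("10.1103/PhysRevLett.116.061102"))  # BibTeX string, or None on failure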
@@ -35,10 +35,10 @@ def get_bibtex(isbn):
     :param isbn: ISBN to fetch BibTeX entry for.
     :returns: A BibTeX string.
     """
-    return doi.get_bibtex(to_doi(isbn))
+    return doi.get_bibtex(to_DOI(isbn))


-def to_doi(isbn):
+def to_DOI(isbn):
     """
     Try to fetch a DOI from a given ISBN.

@@ -1,17 +1,23 @@
 """
 This file contains all the arXiv-related functions.
 """
+import arxiv2bib
 import io
 import re
 import requests
 import tarfile
 import xml.etree.ElementTree

+from urllib.error import HTTPError
+from requests.exceptions import RequestException
+
 from libbmc import tools
+from libbmc.citations import bbl


-arxiv_identifier_from_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
-arxiv_identifier_before_2007 = r"(" + ("|".join([
+ARXIV_IDENTIFIER_FROM_2007 = r"\d{4}\.\d{4,5}(v\d+)?"
+ARXIV_IDENTIFIER_BEFORE_2007 = r"(" + ("|".join([
     "astro-ph.GA",
     "astro-ph.CO",
     "astro-ph.EP",
@@ -159,15 +165,15 @@ arxiv_identifier_before_2007 = r"(" + ("|".join([
     "stat.ME",
     "stat.OT",
     "stat.TH"])) + r")/\d+"
-regex = re.compile(
-    "(" + arxiv_identifier_from_2007 + ")|(" +
-    arxiv_identifier_before_2007 + ")",
+REGEX = re.compile(
+    "(" + ARXIV_IDENTIFIER_FROM_2007 + ")|(" +
+    ARXIV_IDENTIFIER_BEFORE_2007 + ")",
     re.IGNORECASE)

 # Base arXiv URL used as id sometimes
-arxiv_url = "http://arxiv.org/abs/{arxiv_id}"
+ARXIV_URL = "http://arxiv.org/abs/{arxiv_id}"
 # Eprint URL used to download sources
-arxiv_eprint_url = "http://arxiv.org/e-print/{arxiv_id}"
+ARXIV_EPRINT_URL = "http://arxiv.org/e-print/{arxiv_id}"


 def is_valid(arxiv_id):
@@ -177,15 +183,35 @@ def is_valid(arxiv_id):
     :param arxiv_id: The arXiv ID to be checked.
     :returns: Boolean indicating whether the arXiv ID is valid or not.
     """
-    match = regex.match(arxiv_id)
+    match = REGEX.match(arxiv_id)
     return ((match is not None) and (match.group(0) == arxiv_id))


 def get_bibtex(arxiv_id):
     """
-    TODO
+    Get a BibTeX entry for a given arXiv ID.
+
+    .. note::
+
+        Uses the awesome https://pypi.python.org/pypi/arxiv2bib/ module.
+
+    :param arxiv_id: The canonical arXiv id to get BibTeX from.
+    :returns: A BibTeX string or ``None``.
     """
-    assert(False)
+    # Fetch bibtex using the arxiv2bib module
+    try:
+        bibtex = arxiv2bib.arxiv2bib([arxiv_id])
+    except HTTPError:
+        bibtex = []
+
+    for bib in bibtex:
+        if isinstance(bib, arxiv2bib.ReferenceErrorInfo):
+            continue
+        else:
+            # Return fetched bibtex
+            return bib.bibtex()
+    # An error occurred, return None
+    return None


 def extract_from_text(text):
@@ -195,7 +221,7 @@ def extract_from_text(text):
     :param text: The text to extract arXiv IDs from.
     :returns: A list of matching arXiv IDs.
     """
-    return tools.remove_duplicates(regex.findall(text))
+    return tools.remove_duplicates(REGEX.findall(text))


 def to_URL(arxiv_ids):
@@ -206,9 +232,9 @@ def to_URL(arxiv_ids):
     :returns: A list of DOIs URLs.
     """
     if isinstance(arxiv_ids, list):
-        return [arxiv_url.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
+        return [ARXIV_URL.format(arxiv_id=arxiv_id) for arxiv_id in arxiv_ids]
     else:
-        return arxiv_url.format(arxiv_id=arxiv_ids)
+        return ARXIV_URL.format(arxiv_id=arxiv_ids)


 def to_canonical(urls):
@@ -236,11 +262,15 @@ def from_doi(doi):
     :param doi: The DOI of the resource to look for.
     :returns: The arXiv eprint id, or ``None`` if not found.
     """
+    try:
         r = requests.get("http://export.arxiv.org/api/query",
                          params={
                              "search_query": "doi:%s" % (doi,),
                              "max_results": 1
                          })
+        r.raise_for_status()
+    except RequestException:
+        return None
     e = xml.etree.ElementTree.fromstring(r.content)
     for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
         id = entry.find("{http://www.w3.org/2005/Atom}id").text
@@ -250,7 +280,7 @@ def from_doi(doi):
     return None


-def to_doi(arxiv_id):
+def to_DOI(arxiv_id):
     """
     Get the associated DOI for a given arXiv eprint.

@@ -262,11 +292,15 @@ def to_doi(arxiv_id):
     :param eprint: The arXiv eprint id.
     :returns: The DOI if any, or ``None``.
     """
+    try:
         r = requests.get("http://export.arxiv.org/api/query",
                          params={
                              "id_list": arxiv_id,
                              "max_results": 1
                          })
+        r.raise_for_status()
+    except RequestException:
+        return None
     e = xml.etree.ElementTree.fromstring(r.content)
     for entry in e.iter("{http://www.w3.org/2005/Atom}entry"):
         doi = entry.find("{http://arxiv.org/schemas/atom}doi")
@@ -284,12 +318,12 @@ def get_sources(arxiv_id):
     :returns: A ``TarFile`` object of the sources of the arXiv preprint or \
             ``None``.
     """
-    r = requests.get(arxiv_eprint_url.format(arxiv_id=arxiv_id))
     try:
-        assert(r.status_code == requests.codes.ok)
+        r = requests.get(ARXIV_EPRINT_URL.format(arxiv_id=arxiv_id))
+        r.raise_for_status()
         file_object = io.BytesIO(r.content)
         return tarfile.open(fileobj=file_object)
-    except (AssertionError, tarfile.TarError):
+    except (RequestException, AssertionError, tarfile.TarError):
         return None

@@ -297,8 +331,8 @@ def get_bbl(arxiv_id):
     """
     Get the .bbl files (if any) of a given preprint.

-    :param eprint: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in a \
-            canonical form.
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
+            a canonical form.
     :returns: A list of the full text of the ``.bbl`` files (if any) \
             or ``None``.
     """
@@ -311,6 +345,16 @@ def get_bbl(arxiv_id):

 def get_citations(arxiv_id):
     """
-    TODO
+    Get the DOIs cited by a given preprint.
+
+    :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \
+            a canonical form.
+    :returns: A dict of cleaned plaintext citations and their associated DOI.
     """
-    assert(False)
+    dois = {}
+    # Get the list of bbl files for this preprint
+    bbl_files = get_bbl(arxiv_id)
+    for bbl_file in bbl_files:
+        # Fetch the cited DOIs for each of the bbl files
+        dois.update(bbl.get_cited_DOIs(bbl_file))
+    return dois
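And the corresponding sketch for the arXiv helpers (assumes the new arxiv2bib dependency and network access; 1401.2910 is the example id already used in the docstrings):

from libbmc.repositories import arxiv

print(arxiv.is_valid("1401.2910"))    # True
print(arxiv.to_URL("1401.2910"))      # http://arxiv.org/abs/1401.2910
print(arxiv.get_bibtex("1401.2910"))  # BibTeX entry fetched through arxiv2bib, or None
print(arxiv.to_DOI("1401.2910"))      # DOI reported by the arXiv API, or None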
libbmc/repositories/hal.py (new file, 1 line)
@@ -0,0 +1 @@
# TODO
@@ -1,6 +1,12 @@
 """
 This file contains various utility functions.
 """
+import re
+from itertools import islice, chain
+
+
+# Huge URL regex taken from https://gist.github.com/gruber/8891611
URL_REGEX = re.compile(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))")


 def replaceAll(text, replace_dict):
@@ -19,7 +25,8 @@ def replaceAll(text, replace_dict):

 def clean_whitespaces(text):
     """
-    Remove multiple whitespaces from text.
+    Remove multiple whitespaces from text. Also removes leading and trailing \
+            whitespaces.

     :param text: Text to remove multiple whitespaces from.
     :returns: A cleaned text.
@@ -35,3 +42,38 @@ def remove_duplicates(some_list):
     :returns: A list without duplicates.
     """
     return list(set(some_list))
+
+
+def batch(iterable, size):
+    """
+    Get items from a sequence a batch at a time.
+
+    .. note::
+
+        Adapted from
+        https://code.activestate.com/recipes/303279-getting-items-in-batches/.
+
+    .. note::
+
+        All batches must be exhausted immediately.
+
+    :param iterable: An iterable to get batches from.
+    :param size: Size of the batches.
+    :returns: A new batch of the given size at each time.
+    """
+    it = iter(iterable)
+    while True:
+        bi = islice(it, size)
+        try:
+            yield chain([next(bi)], bi)
+        except StopIteration:
+            return
+
+
+def remove_URLs(text):
+    """
+    Remove URLs from a given text (only removes http, https and naked domains \
+            URLs).
+
+    :param text: The text to remove URLs from.
+    :returns: The text without URLs.
+    """
+    return clean_whitespaces(URL_REGEX.sub("", text))
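Finally, a short sketch of the two new utility helpers (illustrative values only):

from libbmc import tools

# batch() yields lazy chunks of the requested size; each chunk has to be
# consumed before asking for the next one
for chunk in tools.batch(range(7), 3):
    print(list(chunk))  # [0, 1, 2] then [3, 4, 5] then [6]

print(tools.remove_URLs("See http://arxiv.org/abs/1401.2910 for details"))
# -> "See for details"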
@@ -1,2 +1,3 @@
-isbnlib==3.5.7
-requests==2.9.1
+arxiv2bib>=1.0.7
+isbnlib>=3.5.7
+requests>=2.9.1