"""
This file contains all the functions to deal with bbl files.
"""
import math
import os
import subprocess

import requests

from . import doi
from . import regex
from . import tools


def clean_bibitem(bibitem):
    """
    Return a plaintext representation of the bibitem from the ``.bbl`` file.

    :param bibitem: The text content of the bibitem.
    :returns: A cleaned plaintext citation from the bibitem.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Strip the LaTeX markup with the bundled opendetex ``delatex`` binary
    output = subprocess.check_output(["%s/opendetex/delatex" % (script_dir,),
                                      "-s"],
                                     input=bibitem.encode("utf-8"))
    output = output.decode("utf-8")
    output = tools.clean_whitespaces(output)
    return output
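
# Hedged usage sketch (the bibitem text is made up; the exact output depends
# on the bundled ``opendetex/delatex`` binary, which must be built next to
# this module):
#
#     clean_bibitem(r"J.~Doe, \emph{A short title}, 2015.")
#     # -> plain text along the lines of "J. Doe, A short title, 2015."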


def parse(bbl):
    """
    Parse a ``*.bbl`` file to get a clean list of plaintext citations.

    :param bbl: Either the path to a ``.bbl`` file or the content of a \
        ``.bbl`` file.
    :returns: A list of cleaned plaintext citations.
    """
    # Handle either a path or the content itself
    if os.path.isfile(bbl):
        with open(bbl, 'r') as fh:
            bbl_content = fh.read()
    else:
        bbl_content = bbl
    # Get a list of bibitems, dropping everything before the first bibitem
    bibitems = regex.bibitems.split(bbl_content)[1:]
    bibitems = [regex.endthebibliography.sub("", i).strip()
                for i in bibitems]
    cleaned_bbl = []
    # Clean every bibitem
    for bibitem in bibitems:
        cleaned_bbl.append(clean_bibitem(bibitem))
    return cleaned_bbl
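
# Hedged usage sketch (the ``.bbl`` snippet is made up; ``regex.bibitems`` and
# the delatex binary do the actual splitting and cleaning):
#
#     bbl_content = ("\\begin{thebibliography}{1}\n"
#                    "\\bibitem{doe15} J.~Doe, \\emph{A short title}, 2015.\n"
#                    "\\end{thebibliography}")
#     parse(bbl_content)
#     # -> a one-element list with the cleaned plaintext citation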


def get_dois(bbl_input):
    """
    Get the DOIs of the papers cited in the given ``.bbl`` file.

    :param bbl_input: Either the path to a ``.bbl`` file or the content of a \
        ``.bbl`` file.
    :returns: A dict mapping cleaned plaintext citations to their associated \
        DOI or arXiv URL (``None`` when no identifier could be found).
    """
    cleaned_citations_with_URLs = parse(bbl_input)
    dois = {}
    cleaned_citations = []
    # First, try to get a DOI directly from each citation
    for citation in cleaned_citations_with_URLs:
        # Get all the URLs in the citation
        raw_urls = regex.urls.findall(citation)
        urls = [u.lower() for u in raw_urls]
        # Remove the URLs from the citation text
        for url in raw_urls:
            citation = citation.replace(url, "")
        citation = tools.clean_whitespaces(citation)
        # Try to find an arXiv link
        arxiv_url = doi.extract_arxiv_links(urls)
        if arxiv_url:
            dois[citation] = arxiv_url
        # Try to find a DOI link
        doi_url = doi.extract_doi_links(urls)
        if doi_url:
            dois[citation] = doi_url
        # If the links search failed, try to find a direct match with a regex
        if not doi_url and not arxiv_url:
            match = doi.match_doi_or_arxiv(citation)
            if match:
                citation = citation.replace(match[1], "")
                if match[0] == "DOI":
                    dois[citation] = "http://dx.doi.org/%s" % (match[1],)
                else:
                    dois[citation] = (
                        "http://arxiv.org/abs/%s" %
                        (match[1].replace("arxiv:", ""),)
                    )
        # If no match was found, stack the citation for the next step
        if citation not in dois:
            cleaned_citations.append(citation)
    # Query Crossref in batches of 10 papers to avoid hitting its timeout
    for i in range(math.ceil(len(cleaned_citations) / 10)):
        lower_bound = 10 * i
        upper_bound = min(10 * (i + 1), len(cleaned_citations))
        r = requests.post("http://search.crossref.org/links",
                          json=cleaned_citations[lower_bound:upper_bound])
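        # The loop below relies on the response shape the original code
        # assumes: a JSON body like {"results": [{"text": ..., "doi": ...}]},
        # where the "doi" key is absent when Crossref found no match.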
        for result in r.json()["results"]:
            if "doi" not in result:
                # No DOI was found; a possible fallback would be a direct
                # query to http://search.crossref.org/dois with the citation
                # text, keeping the best-scored row. For now, just record
                # the miss.
                dois[result["text"]] = None
            else:
                dois[result["text"]] = result["doi"]
    return dois
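
# Hedged usage sketch ("refs.bbl" is a made-up path; network access to
# Crossref is needed for the citations that carry no explicit identifier):
#
#     get_dois("refs.bbl")
#     # -> {"J. Doe, A short title, 2015.": "http://dx.doi.org/10.1234/xyz",
#     #     "Some unmatched citation": None, ...}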