2013-01-08 07:27:46 +01:00
|
|
|
"""
|
|
|
|
Fetches papers.
|
|
|
|
"""
|
2013-01-11 06:08:36 +01:00
|
|
|
import re
|
2013-01-08 07:27:46 +01:00
|
|
|
import os
|
|
|
|
import json
|
2013-01-10 19:16:55 +01:00
|
|
|
import random
|
2013-01-08 07:27:46 +01:00
|
|
|
import requests
|
2013-01-16 09:19:09 +01:00
|
|
|
import lxml.etree
|
|
|
|
from StringIO import StringIO
|
2013-01-08 07:27:46 +01:00
|
|
|
|
|
|
|
def download(phenny, input, verbose=True):
    """
    Downloads a paper.

    Bot entry point: pulls every url out of the incoming irc message, asks a
    local zotero translation-server (port 1969) for metadata and attachments,
    downloads the first pdf attachment it finds, saves it into a public html
    directory and announces the mirrored link in the channel.

    phenny  -- the irc bot instance; provides .say() and .nick
             (NOTE(review): phenny framework type -- attributes assumed from
             usage here, confirm against the framework docs)
    input   -- the triggering message; provides .sender, .admin, .group()
    verbose -- when True, error details and raw dumps are reported back
    """
    # only accept requests in a channel
    if not input.sender.startswith('#'):
        # unless the user is an admin, of course
        if not input.admin:
            phenny.say("i only take requests in the ##hplusroadmap channel.")
            return
        else:
            # just give a warning message to the admin.. not a big deal.
            phenny.say("okay i'll try, but please send me requests in ##hplusroadmap in the future.")

    # get the input
    line = input.group()

    # was this an explicit command?
    explicit = False
    if line.startswith(phenny.nick):
        explicit = True
        # strip the leading "botnick" prefix from the message
        line = line[len(phenny.nick):]

        # also strip the address separator, e.g. "botnick: fetch ..."
        if line.startswith(",") or line.startswith(":"):
            line = line[1:]

    if line.startswith(" "):
        line = line.strip()

    # don't bother if there's nothing there
    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
        return

    # handle every url found in the message, one at a time
    # (note: "line" is rebound to each extracted url here)
    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
        # strip known proxy suffixes from the hostname
        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)

        # local zotero translation-server endpoint
        translation_url = "http://localhost:1969/web"

        headers = {
            "Content-Type": "application/json",
        }

        # translation-server request payload; sessionid is required by the
        # api but its value is arbitrary
        data = {
            "url": line,
            "sessionid": "what"
        }

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200:
            # see if there are any attachments
            content = json.loads(response.content)
            # translation-server returns a list of items; only the first is used
            item = content[0]
            title = item["title"]

            if item.has_key("attachments"):
                # find the first attachment that is a pdf
                pdf_url = None
                for attachment in item["attachments"]:
                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    # pretend to be a browser so the publisher serves the pdf
                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"

                    headers = {
                        "User-Agent": user_agent,
                    }

                    response = None
                    if pdf_url.startswith("https://"):
                        # NOTE(review): certificate verification deliberately
                        # disabled for https attachments -- confirm intent
                        response = requests.get(pdf_url, headers=headers, verify=False)
                    else:
                        response = requests.get(pdf_url, headers=headers)

                    # detect failure
                    if response.status_code == 401:
                        phenny.say("HTTP 401 unauthorized " + str(pdf_url))
                        continue
                    elif response.status_code != 200:
                        phenny.say("HTTP " + str(response.status_code) + " " + str(pdf_url))
                        continue

                    data = response.content

                    # grr.. (drop non-ascii characters so the title is a safe filename)
                    title = title.encode("ascii", "ignore")

                    # save into the public html mirror directory
                    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")

                    file_handler = open(path, "w")
                    file_handler.write(data)
                    file_handler.close()

                    # url-encode the title for the announced link
                    filename = requests.utils.quote(title)
                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"

                    phenny.say(url)
                    continue
                elif verbose and explicit:
                    # no pdf attachment found; fall back to a raw dump
                    phenny.say(download_url(line))
                    continue
            elif verbose and explicit:
                # item had no attachments at all; fall back to a raw dump
                phenny.say(download_url(line))
                continue
        elif verbose and explicit:
            # translation-server did not return 200
            # NOTE(review): this branch layout looks like merged revisions --
            # the 501/other split below may originally have been a sibling of
            # the 200 check; confirm against upstream history
            if response.status_code == 501:
                # translation-server has no translator for this site
                if verbose:
                    phenny.say("no translator available, raw dump: " + download_url(line))
                continue
            else:
                if verbose:
                    phenny.say("error: HTTP " + str(response.status_code) + " " + download_url(line))
                continue
        else:
            # implicit mention in a channel and translation failed: stay quiet
            continue
    return
|
2013-01-08 07:27:46 +01:00
|
|
|
# phenny module wiring: make download() a bot command.
# NOTE(review): attribute semantics (commands/priority/rule) follow the
# phenny/jenni bot framework conventions -- confirm against its docs.
# aliases: ".fetch <url>", ".get <url>", ".download <url>"
download.commands = ["fetch", "get", "download"]
download.priority = "high"
# match any line at all; download() filters for urls itself
download.rule = r'(.*)'
|
|
|
|
|
2013-01-10 19:16:55 +01:00
|
|
|
def download_ieee(url):
    """
    Downloads an IEEE paper. The Zotero translator requires frames/windows to
    be available. Eventually translation-server will be fixed, but until then
    it might be nice to have an IEEE workaround.

    Not implemented yet -- calling this always raises NotImplementedError.
    """
    # example inputs a future implementation would have to cope with:
    # url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
    # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
    raise NotImplementedError
|
|
|
|
|
|
|
|
def download_url(url):
    """
    Fetches a url directly (no translation-server) and mirrors the result.

    If the response is already a pdf it is saved as-is. If it is html, the
    page is parsed and several strategies are tried to locate the real pdf:
    the citation_pdf_url/citation_title meta tags, then site-specific
    scraping for sciencedirect.com, apl.aip.org and wiley-style pages.
    Whatever content is ultimately obtained is written into the public html
    directory and the public mirror url is returned.

    url -- the address to fetch
    returns the public http url of the saved file
    """
    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
    content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)

        # wow, this seriously needs to be cleaned up
        if citation_pdf_url and citation_title:
            # drop non-ascii characters so the title is a safe filename
            citation_title = citation_title.encode("ascii", "ignore")
            response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            # no usable citation meta tags; try site-specific scraping.
            # each branch uses try/except/else: the else clause only runs
            # when the whole try body succeeded, i.e. the pdf was fetched.
            if "sciencedirect.com" in url and not "ShoppingCart" in url:
                try:
                    title = tree.xpath("//h1[@class='svTitle']")[0].text
                    pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    # best-effort: fall back to saving the original page
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "apl.aip.org" in url:
                try:
                    # page <title> looks like "Paper Title | journal | ..."
                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
                    pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    # best-effort: fall back to saving the original page
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "h1 class=\"articleTitle" in content:
                # NOTE(review): marker string suggests a wiley-style page --
                # confirm which publisher this branch targets
                try:
                    title = tree.xpath("//h1[@class='articleTitle']")[0].text
                    title = title.encode("ascii", "ignore")
                    pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
                except:
                    # best-effort: fall back to saving the original page
                    pass
                else:
                    if pdf_url.startswith("/"):
                        # relative link: prepend scheme://host from the page url
                        # (find("/", 8) skips past "https://")
                        url_start = url[:url.find("/",8)]
                        pdf_url = url_start + pdf_url
                    response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
                    content = response.content
                    if "pdf" in response.headers["content-type"]:
                        extension = ".pdf"
            # raise Exception("problem with citation_pdf_url or citation_title")
            # well, at least save the contents from the original url
            pass

    # can't create directories
    title = title.replace("/", "_")

    # save into the public html mirror directory
    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)

    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()

    # public mirror link for the saved file
    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension

    return url
|
|
|
|
|
2013-01-16 09:19:09 +01:00
|
|
|
def parse_html(content):
    """
    Parse an html document into an lxml element tree.

    content -- an html string, or an already-wrapped StringIO instance
    returns the parsed lxml.etree tree
    """
    # lxml.etree.parse wants a file-like object, so wrap raw strings
    stream = content if isinstance(content, StringIO) else StringIO(content)
    html_parser = lxml.etree.HTMLParser()
    return lxml.etree.parse(stream, html_parser)
|
|
|
|
|
|
|
|
def check_if_html(response):
    """
    Report whether the response claims to carry an html payload.

    response -- an http response object with a .headers mapping
    returns True when the content-type header mentions text/html
    """
    content_type = response.headers["content-type"]
    return "text/html" in content_type
|
|
|
|
|
|
|
|
def find_citation_pdf_url(tree, url):
    """
    Returns the <meta name="citation_pdf_url"> content attribute.

    tree -- parsed lxml tree of the page
    url  -- the page's own url, used to absolutize a host-relative pdf link
    returns the (possibly absolutized) pdf url, or None when the tag is absent
    """
    pdf_url = extract_meta_content(tree, "citation_pdf_url")
    if pdf_url and not pdf_url.startswith("http"):
        if not pdf_url.startswith("/"):
            # relative (non-rooted) links are not handled
            raise Exception("unhandled situation (citation_pdf_url)")
        # rooted path: prepend scheme://host taken from the page url
        # (find("/", 8) skips past the "http(s)://" prefix)
        host_prefix = url[:url.find("/",8)]
        pdf_url = host_prefix + pdf_url
    return pdf_url
|
|
|
|
|
|
|
|
def find_citation_title(tree):
    """
    Returns the <meta name="citation_title"> content attribute.

    tree -- parsed lxml tree of the page
    returns the title string, or None when the tag is absent
    """
    return extract_meta_content(tree, "citation_title")
|
|
|
|
|
|
|
|
def extract_meta_content(tree, meta_name):
    """
    Return the content attribute of the first <meta name=...> tag.

    tree      -- parsed lxml tree of the page
    meta_name -- value of the meta tag's name attribute to look up
    returns the content attribute string, or None when no such tag exists
    """
    try:
        # xpath returns a list of matching @content values; take the first
        return tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
    except IndexError:
        # fix: this was a bare "except:", which silently swallowed *every*
        # error (even SystemExit/KeyboardInterrupt). an empty result list
        # (IndexError on [0]) is the only expected failure mode.
        return None
|
|
|
|
|
2013-01-20 02:33:17 +01:00
|
|
|
def filter_fix(url):
    """
    Fixes some common problems in urls.

    Currently: strips the Portland State University library proxy suffix
    from hostnames so the publisher is contacted directly.
    """
    proxy_suffix = ".proxy.lib.pdx.edu"
    if proxy_suffix in url:
        url = url.replace(proxy_suffix, "")
    return url
|
|
|
|
|
2013-01-22 02:11:12 +01:00
|
|
|
def fix_ieee_login_urls(url):
    """
    Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the
    abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp
    page, and then this link is given to paperbot. The actual link is based on
    the arnumber.

    example:
    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
    """
    # default case when things go wrong: hand the url back untouched
    if "ieeexplore.ieee.org/xpl/login.jsp" not in url:
        return url
    if "arnumber=" not in url:
        return url

    # everything after the first "arnumber=" ...
    tail = url.split("arnumber=")[1]
    # ... up to the next query parameter, if any
    # (i guess the url might not look like the example in the docstring)
    arnumber = tail.split("&")[0]

    return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber
|
|
|
|
|