bmc/papers.py

#!/usr/bin/python
"""
Fetches papers.
"""
import re
import os
import json
import params
import random
import requests
import lxml.etree
import sys
from StringIO import StringIO

import pdfparanoia

def download(line, verbose=True):
    """
    Downloads a paper.
    """

    # don't bother if there's nothing there
    if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):
        return
    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = params.server

        headers = {
            "Content-Type": "application/json",
        }

        data = {
            "url": line,
            "sessionid": "what"
        }

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if item.has_key("attachments"):
                pdf_url = None
                for attachment in item["attachments"]:
                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"

                    headers = {
                        "User-Agent": user_agent,
                    }

                    response = None
                    if pdf_url.startswith("https://"):
                        response = requests.get(pdf_url, headers=headers, verify=False)
                    else:
                        response = requests.get(pdf_url, headers=headers)

                    # detect failure
                    if response.status_code == 401:
                        print("HTTP 401 unauthorized " + str(pdf_url))
                        continue
                    elif response.status_code != 200:
                        print("HTTP " + str(response.status_code) + " " + str(pdf_url))
                        continue

                    data = response.content

                    if "pdf" in response.headers["content-type"]:
                        try:
                            data = pdfparanoia.scrub(StringIO(data))
                        except:
                            # this is to avoid a PDFNotImplementedError
                            pass

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join(params.folder, title + ".pdf")

                    file_handler = open(path, "w")
                    file_handler.write(data)
                    file_handler.close()

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = params.url + filename + ".pdf"

                    print(url)
                    continue
                else:
                    print(download_url(line))
                    continue
            else:
                print(download_url(line))
                continue
        else:
            if response.status_code == 501:
                if verbose:
                    print("no translator available, raw dump: " + download_url(line))
            else:
                if verbose:
                    print("error: HTTP " + str(response.status_code) + " " + download_url(line))
    return
download.commands = ["fetch", "get", "download"]
download.priority = "high"
download.rule = r'(.*)'

def download_ieee(url):
    """
    Downloads an IEEE paper. The Zotero translator requires frames/windows to
    be available. Eventually translation-server will be fixed, but until then
    it might be nice to have an IEEE workaround.
    """
    # url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
    # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
    raise NotImplementedError

def download_url(url):
    response = requests.get(url, headers={"User-Agent": "origami-pdf"})
    content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)

        # aip.org sucks, citation_pdf_url is wrong
        if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
            citation_pdf_url = None

        if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
            content = requests.get(citation_pdf_url).content
            tree = parse_html(content)
            # citation_title = ...

        # wow, this seriously needs to be cleaned up
        if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:
            citation_title = citation_title.encode("ascii", "ignore")
            response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            if "sciencedirect.com" in url and not "ShoppingCart" in url:
                try:
                    title = tree.xpath("//h1[@class='svTitle']")[0].text
                    pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "jstor.org/" in url:
                # clean up the url
                if "?" in url:
                    url = url[0:url.find("?")]

                # not all pages have the <input type="hidden" name="ppv-title"> element
                try:
                    title = tree.xpath("//div[@class='hd title']")[0].text
                except Exception:
                    try:
                        title = tree.xpath("//input[@name='ppv-title']/@value")[0]
                    except Exception:
                        pass

                # get the document id
                document_id = None
                if url[-1] != "/":
                    #if "stable/" in url:
                    #elif "discover/" in url:
                    #elif "action/showShelf?candidate=" in url:
                    #elif "pss/" in url:
                    document_id = url.split("/")[-1]

                if document_id.isdigit():
                    try:
                        pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
                        new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
            elif ".aip.org/" in url:
                try:
                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
                    pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "ieeexplore.ieee.org" in url:
                try:
                    pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
                    new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "h1 class=\"articleTitle" in content:
                try:
                    title = tree.xpath("//h1[@class='articleTitle']")[0].text
                    title = title.encode("ascii", "ignore")
                    pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
                except:
                    pass
                else:
                    if pdf_url.startswith("/"):
                        url_start = url[:url.find("/",8)]
                        pdf_url = url_start + pdf_url
                    response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
                    content = response.content
                    if "pdf" in response.headers["content-type"]:
                        extension = ".pdf"
            # raise Exception("problem with citation_pdf_url or citation_title")
            # well, at least save the contents from the original url
            pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")
    title = title.replace(" ", "_")
    title = title[:20]

    path = os.path.join(params.folder, title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            content = pdfparanoia.scrub(StringIO(content))
        except:
            # this is to avoid a PDFNotImplementedError
            pass

    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()

    title = title.encode("ascii", "ignore")
    url = params.url + requests.utils.quote(title) + extension

    return url

def parse_html(content):
    if not isinstance(content, StringIO):
        content = StringIO(content)
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree

def check_if_html(response):
    return "text/html" in response.headers["content-type"]

def find_citation_pdf_url(tree, url):
    """
    Returns the <meta name="citation_pdf_url"> content attribute.
    """
    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
    if citation_pdf_url and  not citation_pdf_url.startswith("http"):
        if citation_pdf_url.startswith("/"):
            url_start = url[:url.find("/",8)]
            citation_pdf_url = url_start + citation_pdf_url
        else:
            raise Exception("unhandled situation (citation_pdf_url)")
    return citation_pdf_url

def find_citation_title(tree):
    """
    Returns the <meta name="citation_title"> content attribute.
    """
    citation_title = extract_meta_content(tree, "citation_title")
    return citation_title

def extract_meta_content(tree, meta_name):
    try:
        content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
    except:
        return None
    else:
        return content

def filter_fix(url):
    """
    Fixes some common problems in urls.
    """
    if ".proxy.lib.pdx.edu" in url:
        url = url.replace(".proxy.lib.pdx.edu", "")
    return url

def fix_ieee_login_urls(url):
    """
    Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the
    abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp
    page, and then this link is given to paperbot. The actual link is based on
    the arnumber.

    example:
    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
    """
    if "ieeexplore.ieee.org/xpl/login.jsp" in url:
        if "arnumber=" in url:
            parts = url.split("arnumber=")

            # i guess the url might not look like the example in the docstring
            if "&" in parts[1]:
                arnumber = parts[1].split("&")[0]
            else:
                arnumber = parts[1]

            return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber

    # default case when things go wrong
    return url

def fix_jstor_pdf_urls(url):
    """
    Fixes urls pointing to jstor pdfs.
    """
    if "jstor.org/" in url:
        if ".pdf" in url and not "?acceptTC=true" in url:
            url += "?acceptTC=true"
    return url

if __name__ == '__main__':
  if len(sys.argv) > 1:
    for a in sys.argv[1:]:
      download(a)
  else:
    while True:
      l = sys.stdin.readline()
      if not l:
        break
      download(l)
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`#!/usr/bin/python`
initial commit 2013-01-08 07:27:46 +01:00			`"""`
			`Fetches papers.`
			`"""`
attempt at multi URL downloads 2013-01-11 06:08:36 +01:00			`import re`
initial commit 2013-01-08 07:27:46 +01:00			`import os`
			`import json`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`import params`
default to downloading the url 2013-01-10 19:16:55 +01:00			`import random`
initial commit 2013-01-08 07:27:46 +01:00			`import requests`
attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`import lxml.etree`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`import sys`
attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`from StringIO import StringIO`
initial commit 2013-01-08 07:27:46 +01:00
use pdfparanoia to remove watermarks 2013-02-09 14:41:50 +01:00			`import pdfparanoia`

remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`def download(line, verbose=True):`
initial commit 2013-01-08 07:27:46 +01:00			`"""`
			`Downloads a paper.`
			`"""`

			`# don't bother if there's nothing there`
			`if len(line) < 5 or (not "http://" in line and not "https://" in line) or not line.startswith("http"):`
			`return`
tabs to spaces 2013-01-11 06:14:16 +01:00			`for line in re.findall('http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\(\),]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):`
auto-remove proxy.lib.pdx.edu from urls 2013-01-20 02:33:17 +01:00			`line = filter_fix(line)`

handle ieee xplore login.jsp urls 2013-01-22 02:11:12 +01:00			`# fix for login.jsp links to ieee xplore`
			`line = fix_ieee_login_urls(line)`
fix jstor pdf urls 2013-02-22 00:13:22 +01:00			`line = fix_jstor_pdf_urls(line)`
handle ieee xplore login.jsp urls 2013-01-22 02:11:12 +01:00
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`translation_url = params.server`
tabs to spaces 2013-01-11 06:14:16 +01:00
			`headers = {`
			`"Content-Type": "application/json",`
			`}`

			`data = {`
			`"url": line,`
			`"sessionid": "what"`
			`}`

			`data = json.dumps(data)`

			`response = requests.post(translation_url, data=data, headers=headers)`

skip translator results if [] or not http 200 2013-02-18 12:17:57 +01:00			`if response.status_code == 200 and response.content != "[]":`
tabs to spaces 2013-01-11 06:14:16 +01:00			`# see if there are any attachments`
			`content = json.loads(response.content)`
			`item = content[0]`
			`title = item["title"]`

			`if item.has_key("attachments"):`
			`pdf_url = None`
			`for attachment in item["attachments"]:`
			`if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:`
			`pdf_url = attachment["url"]`
			`break`

			`if pdf_url:`
			`user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"`

			`headers = {`
			`"User-Agent": user_agent,`
			`}`

			`response = None`
			`if pdf_url.startswith("https://"):`
			`response = requests.get(pdf_url, headers=headers, verify=False)`
			`else:`
			`response = requests.get(pdf_url, headers=headers)`

			`# detect failure`
			`if response.status_code == 401:`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`print("HTTP 401 unauthorized " + str(pdf_url))`
tabs to spaces 2013-01-11 06:14:16 +01:00			`continue`
			`elif response.status_code != 200:`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`print("HTTP " + str(response.status_code) + " " + str(pdf_url))`
tabs to spaces 2013-01-11 06:14:16 +01:00			`continue`

			`data = response.content`

scrub away watermarks in another situation 2013-02-09 14:42:57 +01:00			`if "pdf" in response.headers["content-type"]:`
catch PDFNotImplementedErrors 2013-02-11 17:03:28 +01:00			`try:`
			`data = pdfparanoia.scrub(StringIO(data))`
			`except:`
			`# this is to avoid a PDFNotImplementedError`
			`pass`
scrub away watermarks in another situation 2013-02-09 14:42:57 +01:00
tabs to spaces 2013-01-11 06:14:16 +01:00			`# grr..`
			`title = title.encode("ascii", "ignore")`

remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`path = os.path.join(params.folder, title + ".pdf")`
tabs to spaces 2013-01-11 06:14:16 +01:00
			`file_handler = open(path, "w")`
			`file_handler.write(data)`
			`file_handler.close()`

			`filename = requests.utils.quote(title)`
remove extra periods from filenames 2013-02-10 02:22:51 +01:00
			`# Remove an ending period, which sometimes happens when the`
			`# title of the paper has a period at the end.`
			`if filename[-1] == ".":`
			`filename = filename[:-1]`

remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`url = params.url + filename + ".pdf"`
tabs to spaces 2013-01-11 06:14:16 +01:00
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`print(url)`
tabs to spaces 2013-01-11 06:14:16 +01:00			`continue`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`else:`
			`print(download_url(line))`
tabs to spaces 2013-01-11 06:14:16 +01:00			`continue`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`else:`
			`print(download_url(line))`
tabs to spaces 2013-01-11 06:14:16 +01:00			`continue`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`else:`
tabs to spaces 2013-01-11 06:14:16 +01:00			`if response.status_code == 501:`
			`if verbose:`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`print("no translator available, raw dump: " + download_url(line))`
tabs to spaces 2013-01-11 06:14:16 +01:00			`else:`
			`if verbose:`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`print("error: HTTP " + str(response.status_code) + " " + download_url(line))`
tabs to spaces 2013-01-11 06:14:16 +01:00			`return`
initial commit 2013-01-08 07:27:46 +01:00			`download.commands = ["fetch", "get", "download"]`
			`download.priority = "high"`
			`download.rule = r'(.*)'`

default to downloading the url 2013-01-10 19:16:55 +01:00			`def download_ieee(url):`
			`"""`
			`Downloads an IEEE paper. The Zotero translator requires frames/windows to`
			`be available. Eventually translation-server will be fixed, but until then`
			`it might be nice to have an IEEE workaround.`
			`"""`
			`# url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"`
			`# url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"`
			`raise NotImplementedError`

			`def download_url(url):`
			`response = requests.get(url, headers={"User-Agent": "origami-pdf"})`
			`content = response.content`
pedantic whitespace changes 2013-01-11 00:14:41 +01:00
attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`# just make up a default filename`
default to downloading the url 2013-01-10 19:16:55 +01:00			`title = "%0.2x" % random.getrandbits(128)`

attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`# default extension`
			`extension = ".txt"`

			`if "pdf" in response.headers["content-type"]:`
			`extension = ".pdf"`
			`elif check_if_html(response):`
			`# parse the html string with lxml.etree`
			`tree = parse_html(content)`

			`# extract some metadata with xpaths`
			`citation_pdf_url = find_citation_pdf_url(tree, url)`
			`citation_title = find_citation_title(tree)`

ignore citation_pdf_url on aip.org 2013-02-07 03:00:50 +01:00			`# aip.org sucks, citation_pdf_url is wrong`
prevent a catastrophic error in paper retrieval 2013-02-07 10:54:05 +01:00			`if citation_pdf_url and "link.aip.org/" in citation_pdf_url:`
ignore citation_pdf_url on aip.org 2013-02-07 03:00:50 +01:00			`citation_pdf_url = None`

citation_pdf_url is not always available 2013-02-08 22:23:29 +01:00			`if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:`
an even better IEEE fix 2013-02-08 11:45:43 +01:00			`content = requests.get(citation_pdf_url).content`
			`tree = parse_html(content)`
citation_pdf_url is not always available 2013-02-08 22:23:29 +01:00			`# citation_title = ...`
an even better IEEE fix 2013-02-08 11:45:43 +01:00
better support for aip.org 2013-02-07 02:57:42 +01:00			`# wow, this seriously needs to be cleaned up`
an even better IEEE fix 2013-02-08 11:45:43 +01:00			`if citation_pdf_url and citation_title and not "ieeexplore.ieee.org" in citation_pdf_url:`
don't encode the title until later 2013-01-16 09:43:20 +01:00			`citation_title = citation_title.encode("ascii", "ignore")`
fix title encoding for another pdf case 2013-01-16 09:25:04 +01:00			`response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})`
attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`content = response.content`
			`if "pdf" in response.headers["content-type"]:`
			`extension = ".pdf"`
			`title = citation_title`
			`else:`
fail less catastrophically for a weird sciencedirect url 2013-01-24 02:58:24 +01:00			`if "sciencedirect.com" in url and not "ShoppingCart" in url:`
possibly better sciencedirect handling 2013-01-24 03:01:03 +01:00			`try:`
			`title = tree.xpath("//h1[@class='svTitle']")[0].text`
			`pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]`
			`new_response = requests.get(pdf_url, headers={"User-Agent": "sdf-macross"})`
			`new_content = new_response.content`
			`if "pdf" in new_response.headers["content-type"]:`
			`extension = ".pdf"`
			`except Exception:`
			`pass`
			`else:`
			`content = new_content`
			`response = new_response`
jstor 2013-02-22 00:11:28 +01:00			`elif "jstor.org/" in url:`
			`# clean up the url`
			`if "?" in url:`
			`url = url[0:url.find("?")]`

			`# not all pages have the <input type="hidden" name="ppv-title"> element`
			`try:`
fix jstor title determination 2013-02-22 00:29:53 +01:00			`title = tree.xpath("//div[@class='hd title']")[0].text`
jstor 2013-02-22 00:11:28 +01:00			`except Exception:`
fix jstor title determination 2013-02-22 00:29:53 +01:00			`try:`
			`title = tree.xpath("//input[@name='ppv-title']/@value")[0]`
			`except Exception:`
			`pass`
jstor 2013-02-22 00:11:28 +01:00
			`# get the document id`
			`document_id = None`
			`if url[-1] != "/":`
			`#if "stable/" in url:`
			`#elif "discover/" in url:`
			`#elif "action/showShelf?candidate=" in url:`
			`#elif "pss/" in url:`
			`document_id = url.split("/")[-1]`

			`if document_id.isdigit():`
			`try:`
			`pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"`
fix jstor title determination 2013-02-22 00:29:53 +01:00			`new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})`
jstor 2013-02-22 00:11:28 +01:00			`new_content = new_response.content`
			`if "pdf" in new_response.headers["content-type"]:`
			`extension = ".pdf"`
			`except Exception:`
			`pass`
			`else:`
			`content = new_content`
			`response = new_response`
support both jap.aip.org and apl.aip.org 2013-02-09 15:03:29 +01:00			`elif ".aip.org/" in url:`
better support for aip.org 2013-02-07 02:57:42 +01:00			`try:`
			`title = tree.xpath("//title/text()")[0].split(" \| ")[0]`
			`pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]`
			`new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})`
			`new_content = new_response.content`
			`if "pdf" in new_response.headers["content-type"]:`
			`extension = ".pdf"`
			`except Exception:`
			`pass`
			`else:`
			`content = new_content`
			`response = new_response`
better support for IEEE Xplore 2013-02-08 11:16:10 +01:00			`elif "ieeexplore.ieee.org" in url:`
			`try:`
			`pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]`
			`new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})`
			`new_content = new_response.content`
			`if "pdf" in new_response.headers["content-type"]:`
			`extension = ".pdf"`
			`except Exception:`
			`pass`
			`else:`
			`content = new_content`
			`response = new_response`
fix sciencedirect.com parsing 2013-01-21 05:19:57 +01:00			`elif "h1 class=\"articleTitle" in content:`
fix some bugs for pubs.acs.org 2013-01-16 09:53:18 +01:00			`try:`
fix xpath syntax 2013-01-16 09:56:06 +01:00			`title = tree.xpath("//h1[@class='articleTitle']")[0].text`
fix some bugs for pubs.acs.org 2013-01-16 09:53:18 +01:00			`title = title.encode("ascii", "ignore")`
			`pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]`
			`except:`
			`pass`
			`else:`
			`if pdf_url.startswith("/"):`
			`url_start = url[:url.find("/",8)]`
			`pdf_url = url_start + pdf_url`
			`response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})`
			`content = response.content`
			`if "pdf" in response.headers["content-type"]:`
			`extension = ".pdf"`
			`# raise Exception("problem with citation_pdf_url or citation_title")`
			`# well, at least save the contents from the original url`
			`pass`
attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00
use a random title if title extraction fails 2013-04-15 08:13:49 +02:00			`# make the title again just in case`
			`if not title:`
			`title = "%0.2x" % random.getrandbits(128)`

make paper titles with slashes work 2013-01-21 18:44:13 +01:00			`# can't create directories`
			`title = title.replace("/", "_")`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`title = title.replace(" ", "_")`
			`title = title[:20]`
make paper titles with slashes work 2013-01-21 18:44:13 +01:00
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`path = os.path.join(params.folder, title + extension)`
default to downloading the url 2013-01-10 19:16:55 +01:00
use pdfparanoia to remove watermarks 2013-02-09 14:41:50 +01:00			`if extension in [".pdf", "pdf"]:`
catch PDFNotImplementedErrors 2013-02-11 17:03:28 +01:00			`try:`
			`content = pdfparanoia.scrub(StringIO(content))`
			`except:`
			`# this is to avoid a PDFNotImplementedError`
			`pass`
use pdfparanoia to remove watermarks 2013-02-09 14:41:50 +01:00
default to downloading the url 2013-01-10 19:16:55 +01:00			`file_handler = open(path, "w")`
			`file_handler.write(content)`
			`file_handler.close()`

allow unicode in filenames when returning a url 2013-02-17 04:23:39 +01:00			`title = title.encode("ascii", "ignore")`
remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`url = params.url + requests.utils.quote(title) + extension`
default to downloading the url 2013-01-10 19:16:55 +01:00
			`return url`

attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`def parse_html(content):`
			`if not isinstance(content, StringIO):`
			`content = StringIO(content)`
			`parser = lxml.etree.HTMLParser()`
			`tree = lxml.etree.parse(content, parser)`
			`return tree`

			`def check_if_html(response):`
			`return "text/html" in response.headers["content-type"]`

			`def find_citation_pdf_url(tree, url):`
			`"""`
			`Returns the <meta name="citation_pdf_url"> content attribute.`
			`"""`
			`citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")`
better case handling for find_citation_pdf_url 2013-01-16 09:41:56 +01:00			`if citation_pdf_url and not citation_pdf_url.startswith("http"):`
attempt pdfs when zotero fails me 2013-01-16 09:19:09 +01:00			`if citation_pdf_url.startswith("/"):`
			`url_start = url[:url.find("/",8)]`
			`citation_pdf_url = url_start + citation_pdf_url`
			`else:`
			`raise Exception("unhandled situation (citation_pdf_url)")`
			`return citation_pdf_url`

			`def find_citation_title(tree):`
			`"""`
			`Returns the <meta name="citation_title"> content attribute.`
			`"""`
			`citation_title = extract_meta_content(tree, "citation_title")`
			`return citation_title`

			`def extract_meta_content(tree, meta_name):`
			`try:`
			`content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]`
			`except:`
			`return None`
			`else:`
			`return content`

auto-remove proxy.lib.pdx.edu from urls 2013-01-20 02:33:17 +01:00			`def filter_fix(url):`
			`"""`
			`Fixes some common problems in urls.`
			`"""`
			`if ".proxy.lib.pdx.edu" in url:`
			`url = url.replace(".proxy.lib.pdx.edu", "")`
			`return url`

handle ieee xplore login.jsp urls 2013-01-22 02:11:12 +01:00			`def fix_ieee_login_urls(url):`
			`"""`
			`Fixes urls point to login.jsp on IEEE Xplore. When someone browses to the`
			`abstracts page on IEEE Xplore, they are sometimes sent to the login.jsp`
			`page, and then this link is given to paperbot. The actual link is based on`
			`the arnumber.`

			`example:`
			`http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324`
			`"""`
			`if "ieeexplore.ieee.org/xpl/login.jsp" in url:`
			`if "arnumber=" in url:`
			`parts = url.split("arnumber=")`

			`# i guess the url might not look like the example in the docstring`
			`if "&" in parts[1]:`
			`arnumber = parts[1].split("&")[0]`
			`else:`
			`arnumber = parts[1]`

			`return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=" + arnumber`

			`# default case when things go wrong`
			`return url`

fix jstor pdf urls 2013-02-22 00:13:22 +01:00			`def fix_jstor_pdf_urls(url):`
			`"""`
			`Fixes urls pointing to jstor pdfs.`
			`"""`
			`if "jstor.org/" in url:`
			`if ".pdf" in url and not "?acceptTC=true" in url:`
			`url += "?acceptTC=true"`
			`return url`

remove phenny, tweak some things 2013-05-11 11:57:28 +02:00			`if __name__ == '__main__':`
			`if len(sys.argv) > 1:`
			`for a in sys.argv[1:]:`
			`download(a)`
			`else:`
			`while True:`
			`l = sys.stdin.readline()`
			`if not l:`
			`break`
			`download(l)`