From 357873d10c82638b11f9afe11dd55b0a6588954c Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Thu, 24 Dec 2015 20:34:34 +0100 Subject: [PATCH] Basic API to put and fetch some papers --- database.py | 40 +++++++++ fetch_references.py | 2 +- main.py | 67 ++++++++++----- reference_fetcher/arxiv.py | 46 ++++++++++- routes/__init__.py | 4 + routes/get.py | 161 +++++++++++++++++++++++++++++++++++++ routes/post.py | 113 ++++++++++++++++++++++++++ tools.py | 14 ++++ 8 files changed, 424 insertions(+), 23 deletions(-) create mode 100644 database.py create mode 100644 routes/__init__.py create mode 100644 routes/get.py create mode 100644 routes/post.py create mode 100644 tools.py diff --git a/database.py b/database.py new file mode 100644 index 0000000..9b79f17 --- /dev/null +++ b/database.py @@ -0,0 +1,40 @@ +""" +This file contains the database schema in SQLAlchemy format. +""" +from sqlalchemy import Column, Integer, String +from sqlalchemy.ext.declarative import declarative_base + +Base = declarative_base() + + +class Paper(Base): + __tablename__ = 'papers' + id = Column(Integer, primary_key=True) + doi = Column(String(), nullable=True, unique=True) + arxiv_id = Column(String(25), nullable=True, unique=True) + + def __repr__(self): + return "" % ( + self.id, + self.doi, + self.arxiv_id, + ) + + def json_api_repr(self): + """ + Dict to dump for the JSON API. + """ + return { + "types": self.__tablename__, + "id": self.id, + "attributes": { + "doi": self.doi, + "arxiv_id": self.arxiv_id, + }, + "links": { + "self": "/papers/%d" % (self.id,) + }, + "relationships": { + # TODO + } + } diff --git a/fetch_references.py b/fetch_references.py index 01f5641..e68e014 100755 --- a/fetch_references.py +++ b/fetch_references.py @@ -15,4 +15,4 @@ if __name__ == "__main__": if os.path.isfile(sys.argv[1]): pprint.pprint(bbl.get_dois(sys.argv[1])) else: - pprint.pprint(arxiv.get_dois(sys.argv[1])) + pprint.pprint(arxiv.get_cited_dois(sys.argv[1])) diff --git a/main.py b/main.py index b0f557d..f861e10 100755 --- a/main.py +++ b/main.py @@ -1,31 +1,56 @@ #!/usr/bin/env python3 -from bottle import get, post, run +import bottle from bottle.ext import sqlalchemy -from sqlalchemy import create_engine, Column, Integer, Sequence, String -from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy import create_engine, event +from sqlalchemy.engine import Engine + +import database +import routes + +# Initialize db and include the SQLAlchemy plugin in bottle +engine = create_engine('sqlite:///:memory:', echo=True) + +app = bottle.Bottle() +plugin = sqlalchemy.Plugin( + # SQLAlchemy engine created with create_engine function. + engine, + # SQLAlchemy metadata, required only if create=True. + database.Base.metadata, + # Keyword used to inject session database in a route (default 'db'). + keyword='db', + # If it is true, execute `metadata.create_all(engine)` when plugin is + # applied (default False). + create=True, + # If it is true, plugin commit changes after route is executed (default + # True). + commit=True, + # If it is true and keyword is not defined, plugin uses **kwargs argument + # to inject session database (default False). + use_kwargs=False +) + +app.install(plugin) -@get("/doi/") -def doi(doi): - """ - GET /doi/ - - {} - """ - # TODO - pass +# Auto enable foreign keys for SQLite +@event.listens_for(Engine, "connect") +def set_sqlite_pragma(dbapi_connection, connection_record): + cursor = dbapi_connection.cursor() + cursor.execute("PRAGMA foreign_keys=ON") + cursor.close() -@post("/doi/") -def doi_post(doi): - """ - POST /doi/ +# Routes +app.get("/papers", callback=routes.get.fetch_papers) +app.get("/papers/", callback=routes.get.fetch_by_id) - {} - """ - # TODO - pass +# TODO: Fetch relationships + + +app.post("/papers", callback=routes.post.create_paper) + +# TODO: Update relationships if __name__ == "__main__": - run(host='localhost', port=8080, debug=True) + app.run(host='localhost', port=8080, debug=True) diff --git a/reference_fetcher/arxiv.py b/reference_fetcher/arxiv.py index f323d20..1293a8c 100644 --- a/reference_fetcher/arxiv.py +++ b/reference_fetcher/arxiv.py @@ -4,6 +4,7 @@ This file contains all the arXiv-specific functions. import io import requests import tarfile +import xml.etree.ElementTree from . import bbl @@ -38,7 +39,7 @@ def bbl_from_arxiv(eprint): return bbl_files -def get_dois(eprint): +def get_cited_dois(eprint): """ Get the .bbl files (if any) of a given preprint. @@ -52,3 +53,46 @@ def get_dois(eprint): for bbl_file in bbl_files: dois.update(bbl.get_dois(bbl_file)) return dois + + +def get_arxiv_eprint_from_doi(doi): + """ + Get the arXiv eprint id for a given DOI. + + Params: + - doi is the DOI of the resource to look for. + + Returns the arXiv eprint id, or None if not found. + """ + r = requests.get("http://export.arxiv.org/api/query", + params={ + "search_query": "doi:%s" % (doi,), + "max_results": 1 + }) + e = xml.etree.ElementTree.fromstring(r.content) + for entry in e.iter("{http://www.w3.org/2005/Atom}entry"): + id = entry.find("{http://www.w3.org/2005/Atom}id").text + return id.replace("http://arxiv.org/abs/", "") + return None + + +def get_doi(eprint): + """ + Get the associated DOI for a given arXiv eprint. + + Params: + - eprint is the arXiv eprint id. + + Returns the DOI if any, or None. + """ + r = requests.get("http://export.arxiv.org/api/query", + params={ + "id_list": eprint, + "max_results": 1 + }) + e = xml.etree.ElementTree.fromstring(r.content) + for entry in e.iter("{http://www.w3.org/2005/Atom}entry"): + doi = entry.find("{http://arxiv.org/schemas/atom}doi") + if doi is not None: + return doi.text + return None diff --git a/routes/__init__.py b/routes/__init__.py new file mode 100644 index 0000000..121b1fb --- /dev/null +++ b/routes/__init__.py @@ -0,0 +1,4 @@ +from . import get +from . import post + +__all__ = ["get", "post"] diff --git a/routes/get.py b/routes/get.py new file mode 100644 index 0000000..a74c319 --- /dev/null +++ b/routes/get.py @@ -0,0 +1,161 @@ +""" +This file contains GET routes methods. +""" +import bottle + +import database +import tools + + +def fetch_papers(db): + """ + Fetch all matching papers. + + ``` + GET /papers + Accept: application/vnd.api+json + ``` + + Filtering is possible using `id=ID`, `doi=DOI`, `arxiv_id=ARXIV_ID` or any + combination of these GET parameters. Other parameters are ignored. + + ``` + { + "data": [ + { + "type": "papers", + "id": 1, + "attributes": { + "doi": "TODO", + "arxiv_id": "TODO" + }, + "links": { + "self": "TODO" + }, + "relationships": { + } + } + ] + } + ``` + """ + filters = {k: bottle.request.params[k] + for k in bottle.request.params + if k in ["id", "doi", "arxiv_id"]} + resources = db.query(database.Paper).filter_by(**filters).all() + if resources: + return tools.pretty_json({ + "data": [resource.json_api_repr() for resource in resources] + }) + return bottle.HTTPError(404, "Not found") + + +def fetch_by_id(id, db): + """ + Fetch a resource identified by its internal id. + + ``` + GET /id/ + Accept: application/vnd.api+json + ``` + + ``` + { + "data": { + { + "type": "papers", + "id": 1, + "attributes": { + "doi": "TODO", + "arxiv_id": "TODO" + }, + "links": { + "self": "TODO" + }, + "relationships": { + } + } + } + } + ``` + """ + resource = db.query(database.Paper).filter_by(id=id).first() + if resource: + return tools.pretty_json({ + "data": resource.json_api_repr() + }) + return bottle.HTTPError(404, "Not found") + + +def fetch_by_doi(doi, db): + """ + Fetch a resource identified by its DOI. + + ``` + GET /doi/ + Accept: application/vnd.api+json + ``` + + ``` + { + "data": { + { + "type": "papers", + "id": 1, + "attributes": { + "doi": "TODO", + "arxiv_id": "TODO" + }, + "links": { + "self": "TODO" + }, + "relationships": { + } + } + } + } + ``` + """ + resource = db.query(database.Paper).filter_by(doi=doi).first() + if resource: + return tools.pretty_json({ + "data": resource.json_api_repr() + }) + return bottle.HTTPError(404, "Not found") + + +def fetch_by_arxiv(arxiv, db): + """ + Fetch a resource identified by its arXiv eprint ID. + + ``` + GET /arxiv/ + Accept: application/vnd.api+json + ``` + + ``` + { + "data": { + { + "type": "papers", + "id": 1, + "attributes": { + "doi": "TODO", + "arxiv_id": "TODO" + }, + "links": { + "self": "TODO" + }, + "relationships": { + } + } + } + } + ``` + """ + resource = db.query(database.Paper).filter_by(arxiv_id=arxiv).first() + if resource: + return tools.pretty_json({ + "data": resource.json_api_repr() + }) + return bottle.HTTPError(404, "Not found") diff --git a/routes/post.py b/routes/post.py new file mode 100644 index 0000000..9ff4284 --- /dev/null +++ b/routes/post.py @@ -0,0 +1,113 @@ +""" +This file contains POST routes methods. +""" +import bottle +import json +from sqlalchemy.exc import IntegrityError + +import database +import tools +from reference_fetcher import arxiv + + +def create_paper(db): + """ + Create a new resource identified by its DOI or arXiv eprint id. + + ``` + POST /papers + Content-Type: application/vnd.api+json + Accept: application/vnd.api+json + + { + "data": { + "doi": "DOI", + // OR + "arxiv_id": "ARXIV_ID" + } + } + ``` + + ``` + {} TODO + ``` + """ + data = json.loads(bottle.request.body.read().decode("utf-8")) + # Validate the request + if("data" not in data or + "type" not in data["data"] or + data["data"]["type"] != "papers" or + ("doi" not in data["data"] and "arxiv_id" not in data["data"])): + return bottle.HTTPError(403, "Forbidden") + + data = data["data"] + + if "doi" in data: + paper = create_by_doi(data["doi"], db) + elif "arxiv_id" in data: + paper = create_by_arxiv(data["arxiv"], db) + + if paper is None: + return bottle.HTTPError(409, "Conflict") + + # Return the resource + response = { + "data": paper.json_api_repr() + } + # Note: Return a 202 as the resource has been accepted but is not yet + # processed, especially since its relationships have not yet been fetched. + # TODO: Redirection + return bottle.HTTPResponse(status=202, body=tools.pretty_json(response)) + + +def create_by_doi(doi, db): + """ + Create a new resource identified by its DOI, if it does not exist. + + Return None if insertion failed, the Paper object otherwise. + """ + paper = database.Paper(doi=doi) + + # Try to fetch an arXiv id + arxiv_id = arxiv.get_arxiv_eprint_from_doi(doi) + if arxiv_id: + paper.arxiv_id = arxiv_id + + # Add it to the database + try: + db.add(paper) + db.flush() + except IntegrityError: + # Unique constraint violation, paper already exists + db.rollback() + return None + + # Return the paper + return paper + + +def create_by_arxiv(arxiv, db): + """ + Create a new resource identified by its arXiv eprint ID, if it does not + exist. + + Return None if insertion failed, the Paper object otherwise. + """ + paper = database.Paper(arxiv_id=arxiv) + + # Try to fetch an arXiv id + doi = arxiv.get_doi(arxiv) + if doi: + paper.doi = doi + + # Add it to the database + try: + db.add(paper) + db.flush() + except IntegrityError: + # Unique constraint violation, paper already exists + db.rollback() + return None + + # Return the paper + return paper diff --git a/tools.py b/tools.py new file mode 100644 index 0000000..b42e1c5 --- /dev/null +++ b/tools.py @@ -0,0 +1,14 @@ +""" +Various utility functions. +""" +import json + + +def pretty_json(data): + """ + Return pretty printed JSON-formatted string. + """ + return json.dumps(data, + sort_keys=True, + indent=4, + separators=(',', ': '))