From 4a67e974b82462398e008bdf63dd98cbfd3f2f95 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Sat, 26 Dec 2015 01:25:38 +0100 Subject: [PATCH] Add tags support Also fix misc bugs * Update README.md doc. * Fix relationships not shown in JSON API for a given paper if relationship was not referenced. * Handle tags --- README.md | 153 +++++++++++++++++++++++++++++++++++++++++++++++ database.py | 79 +++++++++++++++++------- main.py | 6 +- routes/delete.py | 30 +++++++--- routes/get.py | 119 ++++++++++++++++++++++++++++++++---- routes/post.py | 96 ++++++++++++++++++++++++++--- 6 files changed, 434 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index f5dce56..baea074 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,30 @@ Metadata for arXiv The goal of this repository is to provide a minimal API to put metadata on arXiv papers. +Disclaimer: This code is neither scalable nor ready to run in production. In +particular, it might be error-prone, and it does not try to be resilient or keep +track of errors. It is here as a proof of concept and to back [this +article](TODO) with some code. However, the `reference_fetcher` part is working +quite well, and was able to extract most of the references from arXiv papers I +tested it on. Note that it takes quite long to run on a paper, mainly due to +the latency of the [Crossref API](http://search.crossref.org/). + + +## Special thanks + +Under the hood, this code uses the wonderful [Crossref +API](http://search.crossref.org/) for parsing references into DOIs, which works +really well and with a very large index. + +It also uses the [Dissemin API](http://beta.dissem.in/) in the +`reference_fetcher` to try to find Open access versions of referenced papers. + +It works using the Open access [arXiv.org](http://arxiv.org) repository, +without which it would be really difficult to achieve a similar thing, due to +paywalls and lack of sources. 
It also uses their +[API](http://arxiv.org/help/api) to fetch DOIs from arXiv ids and vice versa. + + ## Introduction Most of the published scientific papers are availabe online, as preprints. For @@ -218,6 +242,87 @@ Accept: application/vnd.api+json ``` +### Get tags + +``` +GET /tags +Accept: application/vnd.api+json +``` + +Filtering is possible using ``id=ID``, ``name=NAME`` or any combination of +these GET parameters. Other parameters are ignored. + +```json +{ + "data": [ + { + "type": "tags", + "id": 1, + "attributes": { + "name": "foobar" + }, + "links": { + "self": "/tags/1" + } + } + ] +} +``` + + +### Get a tag by id + +``` +GET /tags/1 +Accept: application/vnd.api+json +``` + +```json +{ + "data": { + "type": "tags", + "id": 1, + "attributes": { + "name": "foobar" + }, + "links": { + "self": "/tags/1" + } + } +} +``` + +### Create a tag + +``` +POST /tags +Content-Type: application/vnd.api+json +Accept: application/vnd.api+json + +{ + "data": { + "type": "tags", + "name": "foobar" + } +} +``` + +```json +{ + "data": { + "type": "tags", + "id": 1, + "attributes": { + "name": "foobar" + }, + "links": { + "self": "/tags/1" + } + } +} +``` + + ### Create a relationship between two papers ``` @@ -236,6 +341,26 @@ Accept: application/vnd.api+json Response is empty HTTP 204. +### Add a tag to a paper + +``` +POST /papers/1/relationships/tags +Content-Type: application/vnd.api+json +Accept: application/vnd.api+json + +{ + "data": [ + { "type": "tags", "id": "2" }, + ... + ] +} +``` + +`id` is the id of the tag, which has to be created beforehand. + +Response is empty HTTP 204. + + ### Delete a paper and associated relationships ``` @@ -264,6 +389,26 @@ Accept: application/vnd.api+json Response is empty HTTP 204. 
+### Deleting a tag for a paper + +``` +DELETE /papers/1/relationships/tags +Content-Type: application/vnd.api+json +Accept: application/vnd.api+json + +{ + "data": [ + { "type": "tags", "id": "2" }, + ... + ] +} +``` + +`id` is the id of the tag. + +Response is empty HTTP 204. + + ## Associated library `reference_fetcher` is a module you can use to: diff --git a/database.py b/database.py index 96bb217..8078c6c 100644 --- a/database.py +++ b/database.py @@ -4,7 +4,7 @@ This file contains the database schema in SQLAlchemy format. import sqlite3 from sqlalchemy import event -from sqlalchemy import Column, ForeignKey, Integer, String +from sqlalchemy import Column, ForeignKey, Integer, String, Table from sqlalchemy.engine import Engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship as sqlalchemy_relationship @@ -24,9 +24,9 @@ def set_sqlite_pragma(dbapi_connection, connection_record): cursor.close() -class Association(Base): +class RelationshipAssociation(Base): # Relationships are to be read "left RELATION right" - __tablename__ = "association" + __tablename__ = "relationship_association" id = Column(Integer, primary_key=True) left_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE")) right_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE")) @@ -45,6 +45,12 @@ class Association(Base): back_populates="related_to", passive_deletes=True) +tag_association_table = Table( + 'tag_association', Base.metadata, + Column('paper_id', Integer, ForeignKey('papers.id', ondelete="CASCADE")), + Column('tag_id', Integer, ForeignKey('tags.id', ondelete="CASCADE")) +) + class Paper(Base): __tablename__ = "papers" @@ -52,15 +58,19 @@ class Paper(Base): doi = Column(String(), nullable=True, unique=True) arxiv_id = Column(String(25), nullable=True, unique=True) # related_to are papers related to this paper (this_paper R …) - related_to = sqlalchemy_relationship("Association", - foreign_keys="Association.left_id", + 
related_to = sqlalchemy_relationship("RelationshipAssociation", + foreign_keys="RelationshipAssociation.left_id", back_populates="left_paper", passive_deletes=True) # related_by are papers referenced by this paper (… R this_paper) - related_by = sqlalchemy_relationship("Association", - foreign_keys="Association.right_id", + related_by = sqlalchemy_relationship("RelationshipAssociation", + foreign_keys="RelationshipAssociation.right_id", back_populates="right_paper", passive_deletes=True) + # Tags relationship + tags = sqlalchemy_relationship("Tag", + secondary=tag_association_table, + backref="papers") def __repr__(self): return "" % ( @@ -69,11 +79,27 @@ class Paper(Base): self.arxiv_id, ) - def json_api_repr(self): + def json_api_repr(self, db): """ Dict to dump for the JSON API. """ - relationships = [a.relationship.name for a in self.related_to] + relationships = [i.name for i in db.query(Relationship).all()] + relationships_dict = { + k: { + "links": { + "related": ( + "/papers/%d/relationships/%s?reverse={reverse}" % + (self.id, k) + ) + } + } + for k in relationships + } + relationships_dict["tags"] = { + "links": { + "related": "/papers/%d/relationships/tags" % (self.id,) + } + } return { "types": self.__tablename__, "id": self.id, @@ -84,17 +110,7 @@ class Paper(Base): "links": { "self": "/papers/%d" % (self.id,) }, - "relationships": { - k: { - "links": { - "related": ( - "/papers/%d/relationships/%s?reverse={reverse}" % - (self.id, k) - ) - } - } - for k in relationships - } + "relationships": relationships_dict } @@ -102,6 +118,27 @@ class Relationship(Base): __tablename__ = "relationships" id = Column(Integer, primary_key=True) name = Column(String(), unique=True) - associations = sqlalchemy_relationship("Association", + associations = sqlalchemy_relationship("RelationshipAssociation", back_populates="relationship", passive_deletes=True) + + +class Tag(Base): + __tablename__ = "tags" + id = Column(Integer, primary_key=True) + name = Column(String(), 
unique=True) + + def json_api_repr(self): + """ + Dict to dump for the JSON API. + """ + return { + "types": self.__tablename__, + "id": self.id, + "attributes": { + "name": self.name, + }, + "links": { + "self": "/tags/%d" % (self.id,) + } + } diff --git a/main.py b/main.py index b568b23..7a8023c 100755 --- a/main.py +++ b/main.py @@ -41,7 +41,7 @@ def index(): })) app.get("/papers", callback=routes.get.fetch_papers) -app.get("/papers/", callback=routes.get.fetch_by_id) +app.get("/papers/", callback=routes.get.fetch_papers_by_id) app.get("/papers//relationships/", callback=routes.get.fetch_relationship) app.get("/papers//", @@ -51,8 +51,12 @@ app.route("/papers/", method="DELETE", app.route("/papers//relationships/", method="DELETE", callback=routes.delete.delete_relationship) +app.get("/tags", callback=routes.get.fetch_tags) +app.get("/tags/", callback=routes.get.fetch_tags_by_id) + app.post("/papers", callback=routes.post.create_paper) +app.post("/tags", callback=routes.post.create_tag) app.post("/papers//relationships/", callback=routes.post.update_relationships) diff --git a/routes/delete.py b/routes/delete.py index cb79a3c..d9b2c8e 100644 --- a/routes/delete.py +++ b/routes/delete.py @@ -73,13 +73,27 @@ def delete_relationship(id, name, db): return bottle.HTTPError(403, "Forbidden") # Delete all the requested relationships for i in data: - relationship = (db.query(database.Association) - .filter_by(left_id=id, right_id=i["id"]) - .filter(database.Relationship.name == name) - .first()) - if relationship is None: - # An error occurred => 403 - return bottle.HTTPError(403, "Forbidden") - db.delete(relationship) + if i["type"] == "tags": + # Handle tags separately + tag = db.query(database.Tag).filter_by(id=i["id"]).first() + paper = db.query(database.Paper).filter_by(id=id).first() + if paper is None or tag is None: + # An error occurred => 403 + return bottle.HTTPError(403, "Forbidden") + try: + paper.tags.remove(tag) + except ValueError: + # An error occurred 
=> 403 + return bottle.HTTPError(403, "Forbidden") + db.flush() + else: + relationship = (db.query(database.RelationshipAssociation) + .filter_by(left_id=id, right_id=i["id"]) + .filter(database.Relationship.name == name) + .first()) + if relationship is None: + # An error occurred => 403 + return bottle.HTTPError(403, "Forbidden") + db.delete(relationship) # Return an empty 204 on success return tools.APIResponse(status=204, body="") diff --git a/routes/get.py b/routes/get.py index 7a38ef6..1977f6c 100644 --- a/routes/get.py +++ b/routes/get.py @@ -56,14 +56,14 @@ def fetch_papers(db): resources = db.query(database.Paper).filter_by(**filters).all() if resources: return tools.APIResponse(tools.pretty_json({ - "data": [resource.json_api_repr() for resource in resources] + "data": [resource.json_api_repr(db) for resource in resources] })) return bottle.HTTPError(404, "Not found") -def fetch_by_id(id, db): +def fetch_papers_by_id(id, db): """ - Fetch a resource identified by its internal id. + Fetch a paper identified by its internal id. .. 
code-block:: bash @@ -102,7 +102,7 @@ def fetch_by_id(id, db): resource = db.query(database.Paper).filter_by(id=id).first() if resource: return tools.APIResponse(tools.pretty_json({ - "data": resource.json_api_repr() + "data": resource.json_api_repr(db) })) return bottle.HTTPError(404, "Not found") @@ -152,12 +152,111 @@ def fetch_relationship(id, name, db): "data": [ ] } - if reversed: - relationships = resource.related_by + # Tags are handled differently + if name == "tags": + for t in resource.tags: + response["data"].append({ + "type": name, + "id": t.id + }) else: - relationships = resource.related_to - for r in relationships: - if r.relationship.name == name: - response["data"].append({"type": name, "id": r.right_id}) + if reversed: + relationships = resource.related_by + else: + relationships = resource.related_to + for r in relationships: + if r.relationship.name == name: + response["data"].append({"type": name, "id": r.right_id}) return tools.APIResponse(tools.pretty_json(response)) return bottle.HTTPError(404, "Not found") + + +def fetch_tags(db): + """ + Fetch all matching tags. + + .. code-block:: bash + + GET /tags + Accept: application/vnd.api+json + + + Filtering is possible using ``id=ID``, ``name=NAME`` or any combination of + these GET parameters. Other parameters are ignored. + + + .. code-block:: json + + { + "data": [ + { + "type": "tags", + "id": 1, + "attributes": { + "name": "foobar", + }, + "links": { + "self": "/tags/1" + } + } + ] + } + + :param db: A database session, injected by the ``Bottle`` plugin. + :returns: An ``HTTPResponse``. 
+ """ + filters = {k: bottle.request.params[k] + for k in bottle.request.params + if k in ["id", "name"]} + resources = db.query(database.Tags).filter_by(**filters).all() + if resources: + return tools.APIResponse(tools.pretty_json({ + "data": [resource.json_api_repr() for resource in resources] + })) + return bottle.HTTPError(404, "Not found") + + +def fetch_tags_by_id(id, db): + """ + Fetch a tag identified by its internal id. + + .. code-block:: bash + + GET /tag/1 + Accept: application/vnd.api+json + + + .. code-block:: json + + { + "data": { + "type": "papers", + "id": 1, + "attributes": { + "doi": "10.1126/science.1252319", + "arxiv_id": "1401.2910" + }, + "links": { + "self": "/papers/1" + }, + "relationships": { + "cite": { + "links": { + "related": "/papers/1/relationships/cite" + } + }, + … + } + } + } + + :param id: The id of the requested tag. + :param db: A database session, injected by the ``Bottle`` plugin. + :returns: An ``HTTPResponse``. + """ + resource = db.query(database.Tags).filter_by(id=id).first() + if resource: + return tools.APIResponse(tools.pretty_json({ + "data": resource.json_api_repr() + })) + return bottle.HTTPError(404, "Not found") diff --git a/routes/post.py b/routes/post.py index be4ad3e..96f4988 100644 --- a/routes/post.py +++ b/routes/post.py @@ -12,7 +12,7 @@ from reference_fetcher import arxiv def create_paper(db): """ - Create a new resource identified by its DOI or arXiv eprint id. + Create a new paper identified by its DOI or arXiv eprint id. .. code-block:: bash @@ -76,7 +76,7 @@ def create_paper(db): # Return the resource response = { - "data": paper.json_api_repr() + "data": paper.json_api_repr(db) } # Import "cite" relation add_cite_relationship(paper, db) @@ -153,7 +153,6 @@ def add_cite_relationship(paper, db): :param db: A database session :returns: Nothing. """ - # TODO: Known bug: too many levels of recursion! 
# If paper is on arXiv if paper.arxiv_id is not None: # Get the cited DOIs @@ -169,7 +168,8 @@ def add_cite_relationship(paper, db): # If paper does not exist in db, add it right_paper = create_by_doi(doi, db) # Update cite relationship for this paper, recursively - add_cite_relationship(right_paper, db) + # TODO: Known bug: too many levels of recursion! + # add_cite_relationship(right_paper, db) # Update the relationships update_relationship_backend(paper.id, right_paper.id, "cite", db) # If paper is not on arXiv, nothing to do @@ -216,10 +216,21 @@ def update_relationships(id, name, db): return bottle.HTTPError(403, "Forbidden") # Update all the relationships for i in data: - updated = update_relationship_backend(id, i["id"], name, db) - if updated is None: - # An error occurred => 403 - return bottle.HTTPError(403, "Forbidden") + if i["type"] == "tags": + # Handle tags separately + tag = db.query(database.Tag).filter_by(id=i["id"]).first() + paper = db.query(database.Paper).filter_by(id=id).first() + if paper is None or tag is None: + # An error occurred => 403 + return bottle.HTTPError(403, "Forbidden") + paper.tags.append(tag) + db.add(paper) + db.flush() + else: + updated = update_relationship_backend(id, i["id"], name, db) + if updated is None: + # An error occurred => 403 + return bottle.HTTPError(403, "Forbidden") # Return an empty 204 on success return tools.APIResponse(status=204, body="") @@ -246,7 +257,7 @@ def update_relationship_backend(left_id, right_id, name, db): db.add(relationship) db.flush() # Update the relationship - a = database.Association(relationship_id=relationship.id) + a = database.RelationshipAssociation(relationship_id=relationship.id) a.right_paper = right_paper left_paper.related_to.append(a) try: @@ -257,3 +268,70 @@ def update_relationship_backend(left_id, right_id, name, db): db.rollback() return None return left_paper + + +def create_tag(db): + """ + Create a new tag. + + .. 
code-block:: bash + + POST /tags + Content-Type: application/vnd.api+json + Accept: application/vnd.api+json + + { + "data": { + "name": "foobar", + } + } + + + .. code-block:: json + + { + "data": { + "type": "tags", + "id": 1, + "attributes": { + "name": "foobar", + }, + "links": { + "self": "/tags/1" + } + } + } + + :param db: A database session, injected by the ``Bottle`` plugin. + :returns: An ``HTTPResponse``. + """ + data = json.loads(bottle.request.body.read().decode("utf-8")) + # Validate the request + if("data" not in data or + "type" not in data["data"] or + data["data"]["type"] != "tags" or + "name" not in data["data"]): + return bottle.HTTPError(403, "Forbidden") + + data = data["data"] + + tag = database.Tag(name=data["name"]) + + # Add it to the database + try: + db.add(tag) + db.flush() + except IntegrityError: + # Unique constraint violation, paper already exists + db.rollback() + return bottle.HTTPError(409, "Conflict") + + # Return the resource + response = { + "data": tag.json_api_repr() + } + # Return 200 with the correct body + headers = {"Location": "/tags/%d" % (tag.id,)} + return tools.APIResponse(status=200, + body=tools.pretty_json(response), + headers=headers)