Add tags support

Also fix misc bugs

* Update README.md doc.
* Fix relationships not shown in JSON API for a given paper if relationship was
not referenced.
* Handle tags
This commit is contained in:
Lucas Verney 2015-12-26 01:25:38 +01:00
parent 7e80aacdb7
commit 4a67e974b8
6 changed files with 434 additions and 49 deletions

153
README.md
View File

@ -3,6 +3,30 @@ Metadata for arXiv
The goal of this repository is to provide a minimal API to put metadata on arXiv papers. The goal of this repository is to provide a minimal API to put metadata on arXiv papers.
Disclaimer: This code is neither scalable nor ready to run in production. In
particular, it might be error-prone, and it does not try to be resilient or to
keep track of errors. It is here as a proof of concept and to back [this
article](TODO) with some code. However, the `reference_fetcher` part is working
quite well, and was able to extract most of the references from the arXiv
papers I tested it on. Note that it takes quite a long time to run on a paper,
mainly due to the latency of the [Crossref API](http://search.crossref.org/).
## Special thanks
Under the hood, this code uses the wonderful [Crossref
API](http://search.crossref.org/) for reference parsing to DOI, which works
really well and with a very large index.
It also uses the [Dissemin API](http://beta.dissem.in/) in the
`reference_fetcher` to try to find Open access versions of referenced papers.
It works using the Open access [arXiv.org](http://arxiv.org) repository,
without which it would be really difficult to achieve a similar thing, due to
paywalls and lack of sources. It also uses their
[API](http://arxiv.org/help/api) to fetch DOIs from arXiv ids and conversely.
## Introduction ## Introduction
Most of the published scientific papers are available online, as preprints. For Most of the published scientific papers are available online, as preprints. For
@ -218,6 +242,95 @@ Accept: application/vnd.api+json
``` ```
### Get tags
```
GET /tags
Accept: application/vnd.api+json
```
Filtering is possible using ``id=ID``, ``name=NAME`` or any combination of
these GET parameters. Other parameters are ignored.
```json
{
"data": [
{
"type": "tags",
"id": 1,
"attributes": {
"name": "foobar",
},
"links": {
"self": "/tags/1"
}
}
]
}
```
### Get a tag by id
```
GET /tags/1
Accept: application/vnd.api+json
```
```json
{
"data": {
"type": "tags",
"id": 1,
"attributes": {
"name": "foobar"
},
"links": {
"self": "/tags/1"
}
}
}
```
### Create a tag
```
POST /tags
Content-Type: application/vnd.api+json
Accept: application/vnd.api+json
{
"data": {
"type": "tags",
"name": "foobar"
}
}
```
```json
{
"data": {
"type": "tags",
"id": 1,
"attributes": {
"name": "foobar",
},
"links": {
"self": "/tags/1"
}
}
}
```
### Create a relationship between two papers ### Create a relationship between two papers
``` ```
@ -236,6 +349,26 @@ Accept: application/vnd.api+json
Response is empty HTTP 204. Response is empty HTTP 204.
### Add a tag to a paper
```
POST /papers/1/relationships/tags
Content-Type: application/vnd.api+json
Accept: application/vnd.api+json
{
"data": [
{ "type": "tags", "id": "2" },
...
]
}
```
`id` is the id of the tag, which has to be created previously.
Response is empty HTTP 204.
### Delete a paper and associated relationships ### Delete a paper and associated relationships
``` ```
@ -264,6 +397,26 @@ Accept: application/vnd.api+json
Response is empty HTTP 204. Response is empty HTTP 204.
### Delete a tag from a paper
```
DELETE /papers/1/relationships/tags
Content-Type: application/vnd.api+json
Accept: application/vnd.api+json
{
"data": [
{ "type": "tags", "id": "2" },
...
]
}
```
`id` is the id of the tag.
Response is empty HTTP 204.
## Associated library ## Associated library
`reference_fetcher` is a module you can use to: `reference_fetcher` is a module you can use to:

View File

@ -4,7 +4,7 @@ This file contains the database schema in SQLAlchemy format.
import sqlite3 import sqlite3
from sqlalchemy import event from sqlalchemy import event
from sqlalchemy import Column, ForeignKey, Integer, String from sqlalchemy import Column, ForeignKey, Integer, String, Table
from sqlalchemy.engine import Engine from sqlalchemy.engine import Engine
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship as sqlalchemy_relationship from sqlalchemy.orm import relationship as sqlalchemy_relationship
@ -24,9 +24,9 @@ def set_sqlite_pragma(dbapi_connection, connection_record):
cursor.close() cursor.close()
class Association(Base): class RelationshipAssociation(Base):
# Relationships are to be read "left RELATION right" # Relationships are to be read "left RELATION right"
__tablename__ = "association" __tablename__ = "relationship_association"
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
left_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE")) left_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE"))
right_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE")) right_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE"))
@ -45,6 +45,12 @@ class Association(Base):
back_populates="related_to", back_populates="related_to",
passive_deletes=True) passive_deletes=True)
# Link table for the many-to-many association between papers and tags.
# Both foreign keys are declared with ON DELETE CASCADE.
# NOTE(review): no primary key / unique constraint is declared on the
# (paper_id, tag_id) pair, so presumably the same tag can be linked to a paper
# more than once -- confirm whether this is intended.
tag_association_table = Table(
    "tag_association",
    Base.metadata,
    Column(
        "paper_id",
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
    ),
    Column(
        "tag_id",
        Integer,
        ForeignKey("tags.id", ondelete="CASCADE"),
    ),
)
class Paper(Base): class Paper(Base):
__tablename__ = "papers" __tablename__ = "papers"
@ -52,15 +58,19 @@ class Paper(Base):
doi = Column(String(), nullable=True, unique=True) doi = Column(String(), nullable=True, unique=True)
arxiv_id = Column(String(25), nullable=True, unique=True) arxiv_id = Column(String(25), nullable=True, unique=True)
# related_to are papers related to this paper (this_paper R …) # related_to are papers related to this paper (this_paper R …)
related_to = sqlalchemy_relationship("Association", related_to = sqlalchemy_relationship("RelationshipAssociation",
foreign_keys="Association.left_id", foreign_keys="RelationshipAssociation.left_id",
back_populates="left_paper", back_populates="left_paper",
passive_deletes=True) passive_deletes=True)
# related_by are papers referenced by this paper (… R this_paper) # related_by are papers referenced by this paper (… R this_paper)
related_by = sqlalchemy_relationship("Association", related_by = sqlalchemy_relationship("RelationshipAssociation",
foreign_keys="Association.right_id", foreign_keys="RelationshipAssociation.right_id",
back_populates="right_paper", back_populates="right_paper",
passive_deletes=True) passive_deletes=True)
# Tags relationship
tags = sqlalchemy_relationship("Tag",
secondary=tag_association_table,
backref="papers")
def __repr__(self): def __repr__(self):
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % ( return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
@ -69,11 +79,27 @@ class Paper(Base):
self.arxiv_id, self.arxiv_id,
) )
def json_api_repr(self): def json_api_repr(self, db):
""" """
Dict to dump for the JSON API. Dict to dump for the JSON API.
""" """
relationships = [a.relationship.name for a in self.related_to] relationships = [i.name for i in db.query(Relationship).all()]
relationships_dict = {
k: {
"links": {
"related": (
"/papers/%d/relationships/%s?reverse={reverse}" %
(self.id, k)
)
}
}
for k in relationships
}
relationships_dict["tags"] = {
"links": {
"related": "/papers/%d/relationships/tags" % (self.id,)
}
}
return { return {
"types": self.__tablename__, "types": self.__tablename__,
"id": self.id, "id": self.id,
@ -84,17 +110,7 @@ class Paper(Base):
"links": { "links": {
"self": "/papers/%d" % (self.id,) "self": "/papers/%d" % (self.id,)
}, },
"relationships": { "relationships": relationships_dict
k: {
"links": {
"related": (
"/papers/%d/relationships/%s?reverse={reverse}" %
(self.id, k)
)
}
}
for k in relationships
}
} }
@ -102,6 +118,27 @@ class Relationship(Base):
__tablename__ = "relationships" __tablename__ = "relationships"
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
name = Column(String(), unique=True) name = Column(String(), unique=True)
associations = sqlalchemy_relationship("Association", associations = sqlalchemy_relationship("RelationshipAssociation",
back_populates="relationship", back_populates="relationship",
passive_deletes=True) passive_deletes=True)
class Tag(Base):
    """
    A named tag, stored in the ``tags`` table.
    """
    __tablename__ = "tags"
    # Internal identifier of the tag.
    id = Column(Integer, primary_key=True)
    # Name of the tag, declared unique.
    name = Column(String(), unique=True)

    def json_api_repr(self):
        """
        Build the dict to dump for this tag in the JSON API.

        :returns: A dict with the ``types``, ``id``, ``attributes`` and \
                ``links`` keys.
        """
        attributes = {"name": self.name}
        links = {"self": "/tags/%d" % (self.id,)}
        # NOTE(review): the key is "types" (plural) whereas the README
        # examples show "type" -- confirm which one clients expect.
        return {
            "types": self.__tablename__,
            "id": self.id,
            "attributes": attributes,
            "links": links,
        }

View File

@ -41,7 +41,7 @@ def index():
})) }))
app.get("/papers", callback=routes.get.fetch_papers) app.get("/papers", callback=routes.get.fetch_papers)
app.get("/papers/<id:int>", callback=routes.get.fetch_by_id) app.get("/papers/<id:int>", callback=routes.get.fetch_papers_by_id)
app.get("/papers/<id:int>/relationships/<name>", app.get("/papers/<id:int>/relationships/<name>",
callback=routes.get.fetch_relationship) callback=routes.get.fetch_relationship)
app.get("/papers/<id:int>/<name>", app.get("/papers/<id:int>/<name>",
@ -51,8 +51,12 @@ app.route("/papers/<id:int>", method="DELETE",
app.route("/papers/<id:int>/relationships/<name>", method="DELETE", app.route("/papers/<id:int>/relationships/<name>", method="DELETE",
callback=routes.delete.delete_relationship) callback=routes.delete.delete_relationship)
app.get("/tags", callback=routes.get.fetch_tags)
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
app.post("/papers", callback=routes.post.create_paper) app.post("/papers", callback=routes.post.create_paper)
app.post("/tags", callback=routes.post.create_tag)
app.post("/papers/<id:int>/relationships/<name>", app.post("/papers/<id:int>/relationships/<name>",
callback=routes.post.update_relationships) callback=routes.post.update_relationships)

View File

@ -73,13 +73,27 @@ def delete_relationship(id, name, db):
return bottle.HTTPError(403, "Forbidden") return bottle.HTTPError(403, "Forbidden")
# Delete all the requested relationships # Delete all the requested relationships
for i in data: for i in data:
relationship = (db.query(database.Association) if i["type"] == "tags":
.filter_by(left_id=id, right_id=i["id"]) # Handle tags separately
.filter(database.Relationship.name == name) tag = db.query(database.Tag).filter_by(id=i["id"]).first()
.first()) paper = db.query(database.Paper).filter_by(id=id).first()
if relationship is None: if paper is None or tag is None:
# An error occurred => 403 # An error occurred => 403
return bottle.HTTPError(403, "Forbidden") return bottle.HTTPError(403, "Forbidden")
db.delete(relationship) try:
paper.tags.remove(tag)
except ValueError:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
db.flush()
else:
relationship = (db.query(database.RelationshipAssociation)
.filter_by(left_id=id, right_id=i["id"])
.filter(database.Relationship.name == name)
.first())
if relationship is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
db.delete(relationship)
# Return an empty 204 on success # Return an empty 204 on success
return tools.APIResponse(status=204, body="") return tools.APIResponse(status=204, body="")

View File

@ -56,14 +56,14 @@ def fetch_papers(db):
resources = db.query(database.Paper).filter_by(**filters).all() resources = db.query(database.Paper).filter_by(**filters).all()
if resources: if resources:
return tools.APIResponse(tools.pretty_json({ return tools.APIResponse(tools.pretty_json({
"data": [resource.json_api_repr() for resource in resources] "data": [resource.json_api_repr(db) for resource in resources]
})) }))
return bottle.HTTPError(404, "Not found") return bottle.HTTPError(404, "Not found")
def fetch_by_id(id, db): def fetch_papers_by_id(id, db):
""" """
Fetch a resource identified by its internal id. Fetch a paper identified by its internal id.
.. code-block:: bash .. code-block:: bash
@ -102,7 +102,7 @@ def fetch_by_id(id, db):
resource = db.query(database.Paper).filter_by(id=id).first() resource = db.query(database.Paper).filter_by(id=id).first()
if resource: if resource:
return tools.APIResponse(tools.pretty_json({ return tools.APIResponse(tools.pretty_json({
"data": resource.json_api_repr() "data": resource.json_api_repr(db)
})) }))
return bottle.HTTPError(404, "Not found") return bottle.HTTPError(404, "Not found")
@ -152,12 +152,111 @@ def fetch_relationship(id, name, db):
"data": [ "data": [
] ]
} }
if reversed: # Tags are handled differently
relationships = resource.related_by if name == "tags":
for t in resource.tags:
response["data"].append({
"type": name,
"id": t.id
})
else: else:
relationships = resource.related_to if reversed:
for r in relationships: relationships = resource.related_by
if r.relationship.name == name: else:
response["data"].append({"type": name, "id": r.right_id}) relationships = resource.related_to
for r in relationships:
if r.relationship.name == name:
response["data"].append({"type": name, "id": r.right_id})
return tools.APIResponse(tools.pretty_json(response)) return tools.APIResponse(tools.pretty_json(response))
return bottle.HTTPError(404, "Not found") return bottle.HTTPError(404, "Not found")
def fetch_tags(db):
    """
    Fetch all matching tags.

    .. code-block:: bash

        GET /tags
        Accept: application/vnd.api+json


    Filtering is possible using ``id=ID``, ``name=NAME`` or any combination of
    these GET parameters. Other parameters are ignored.


    .. code-block:: json

        {
            "data": [
                {
                    "type": "tags",
                    "id": 1,
                    "attributes": {
                        "name": "foobar"
                    },
                    "links": {
                        "self": "/tags/1"
                    }
                }
            ]
        }

    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An ``HTTPResponse``. A 404 error is returned when no tag \
            matches the filters.
    """
    # Only keep the whitelisted query parameters as filters
    filters = {k: bottle.request.params[k]
               for k in bottle.request.params
               if k in ["id", "name"]}
    # The model class is ``database.Tag`` (singular); ``database.Tags`` does
    # not exist and would raise an AttributeError on every request.
    resources = db.query(database.Tag).filter_by(**filters).all()
    if resources:
        return tools.APIResponse(tools.pretty_json({
            "data": [resource.json_api_repr() for resource in resources]
        }))
    return bottle.HTTPError(404, "Not found")
def fetch_tags_by_id(id, db):
    """
    Fetch a tag identified by its internal id.

    .. code-block:: bash

        GET /tags/1
        Accept: application/vnd.api+json


    .. code-block:: json

        {
            "data": {
                "type": "tags",
                "id": 1,
                "attributes": {
                    "name": "foobar"
                },
                "links": {
                    "self": "/tags/1"
                }
            }
        }

    :param id: The id of the requested tag.
    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An ``HTTPResponse``. A 404 error is returned when no tag has \
            this id.
    """
    # The model class is ``database.Tag`` (singular); ``database.Tags`` does
    # not exist and would raise an AttributeError on every request.
    resource = db.query(database.Tag).filter_by(id=id).first()
    if resource:
        return tools.APIResponse(tools.pretty_json({
            "data": resource.json_api_repr()
        }))
    return bottle.HTTPError(404, "Not found")

View File

@ -12,7 +12,7 @@ from reference_fetcher import arxiv
def create_paper(db): def create_paper(db):
""" """
Create a new resource identified by its DOI or arXiv eprint id. Create a new paper identified by its DOI or arXiv eprint id.
.. code-block:: bash .. code-block:: bash
@ -76,7 +76,7 @@ def create_paper(db):
# Return the resource # Return the resource
response = { response = {
"data": paper.json_api_repr() "data": paper.json_api_repr(db)
} }
# Import "cite" relation # Import "cite" relation
add_cite_relationship(paper, db) add_cite_relationship(paper, db)
@ -153,7 +153,6 @@ def add_cite_relationship(paper, db):
:param db: A database session :param db: A database session
:returns: Nothing. :returns: Nothing.
""" """
# TODO: Known bug: too many levels of recursion!
# If paper is on arXiv # If paper is on arXiv
if paper.arxiv_id is not None: if paper.arxiv_id is not None:
# Get the cited DOIs # Get the cited DOIs
@ -169,7 +168,8 @@ def add_cite_relationship(paper, db):
# If paper does not exist in db, add it # If paper does not exist in db, add it
right_paper = create_by_doi(doi, db) right_paper = create_by_doi(doi, db)
# Update cite relationship for this paper, recursively # Update cite relationship for this paper, recursively
add_cite_relationship(right_paper, db) # TODO: Known bug: too many levels of recursion!
# add_cite_relationship(right_paper, db)
# Update the relationships # Update the relationships
update_relationship_backend(paper.id, right_paper.id, "cite", db) update_relationship_backend(paper.id, right_paper.id, "cite", db)
# If paper is not on arXiv, nothing to do # If paper is not on arXiv, nothing to do
@ -216,10 +216,21 @@ def update_relationships(id, name, db):
return bottle.HTTPError(403, "Forbidden") return bottle.HTTPError(403, "Forbidden")
# Update all the relationships # Update all the relationships
for i in data: for i in data:
updated = update_relationship_backend(id, i["id"], name, db) if i["type"] == "tags":
if updated is None: # Handle tags separately
# An error occurred => 403 tag = db.query(database.Tag).filter_by(id=i["id"]).first()
return bottle.HTTPError(403, "Forbidden") paper = db.query(database.Paper).filter_by(id=id).first()
if paper is None or tag is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
paper.tags.append(tag)
db.add(paper)
db.flush()
else:
updated = update_relationship_backend(id, i["id"], name, db)
if updated is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
# Return an empty 204 on success # Return an empty 204 on success
return tools.APIResponse(status=204, body="") return tools.APIResponse(status=204, body="")
@ -246,7 +257,7 @@ def update_relationship_backend(left_id, right_id, name, db):
db.add(relationship) db.add(relationship)
db.flush() db.flush()
# Update the relationship # Update the relationship
a = database.Association(relationship_id=relationship.id) a = database.RelationshipAssociation(relationship_id=relationship.id)
a.right_paper = right_paper a.right_paper = right_paper
left_paper.related_to.append(a) left_paper.related_to.append(a)
try: try:
@ -257,3 +268,70 @@ def update_relationship_backend(left_id, right_id, name, db):
db.rollback() db.rollback()
return None return None
return left_paper return left_paper
def create_tag(db):
    """
    Create a new tag.

    .. code-block:: bash

        POST /tags
        Content-Type: application/vnd.api+json
        Accept: application/vnd.api+json

        {
            "data": {
                "type": "tags",
                "name": "foobar"
            }
        }


    .. code-block:: json

        {
            "data": {
                "type": "tags",
                "id": 1,
                "attributes": {
                    "name": "foobar"
                },
                "links": {
                    "self": "/tags/1"
                }
            }
        }

    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An ``HTTPResponse``. A 403 error is returned for an invalid \
            request and a 409 error when the tag could not be inserted \
            because of an integrity error.
    """
    try:
        data = json.loads(bottle.request.body.read().decode("utf-8"))
    except ValueError:
        # Body is not valid UTF-8 or not valid JSON (both errors derive from
        # ValueError) => reject it like any other invalid request
        return bottle.HTTPError(403, "Forbidden")
    # Validate the request: a "data" object of type "tags" with a "name"
    payload = data.get("data") if isinstance(data, dict) else None
    if (not isinstance(payload, dict) or
            payload.get("type") != "tags" or
            "name" not in payload):
        return bottle.HTTPError(403, "Forbidden")

    tag = database.Tag(name=payload["name"])

    # Add it to the database
    try:
        db.add(tag)
        db.flush()
    except IntegrityError:
        # Unique constraint violation, tag already exists
        db.rollback()
        return bottle.HTTPError(409, "Conflict")

    # Return the resource
    response = {
        "data": tag.json_api_repr()
    }
    # Return 200 with the correct body
    headers = {"Location": "/tags/%d" % (tag.id,)}
    return tools.APIResponse(status=200,
                             body=tools.pretty_json(response),
                             headers=headers)