Add tags support

Also fix misc bugs

* Update README.md doc.
* Fix relationships not shown in JSON API for a given paper if relationship was
not referenced.
* Handle tags
This commit is contained in:
Lucas Verney 2015-12-26 01:25:38 +01:00
parent 7e80aacdb7
commit 4a67e974b8
6 changed files with 434 additions and 49 deletions

153
README.md
View File

@ -3,6 +3,30 @@ Metadata for arXiv
The goal of this repository is to provide a minimal API to put metadata on arXiv papers.
Disclaimer: This code is neither scalable nor ready to run in production. In
particular, it might be error-prone, and it does not try to be resilient or to
keep track of errors. It is here as a proof of concept and to back [this
article](TODO) with some code. However, the `reference_fetcher` part is working
quite well, and was able to extract most of the references from the arXiv
papers I tested it on. Note that running it on a paper takes quite a long time,
mainly due to the latency of the [Crossref API](http://search.crossref.org/).
## Special thanks
Under the hood, this code uses the wonderful [Crossref
API](http://search.crossref.org/) for reference parsing to DOI, which works
really well and with a very large index.
It also uses the [Dissemin API](http://beta.dissem.in/) in the
`reference_fetcher` to try to find Open access versions of referenced papers.
It works using the Open access [arXiv.org](http://arxiv.org) repository,
without which it would be really difficult to achieve a similar thing, due to
paywalls and lack of sources. It also uses their
[API](http://arxiv.org/help/api) to fetch DOIs from arXiv ids and conversely.
## Introduction
Most of the published scientific papers are available online, as preprints. For
@ -218,6 +242,95 @@ Accept: application/vnd.api+json
```
### Get tags
```
GET /tags
Accept: application/vnd.api+json
```
Filtering is possible using ``id=ID``, ``name=NAME`` or any combination of
these GET parameters. Other parameters are ignored.
```json
{
"data": [
{
"type": "tags",
"id": 1,
"attributes": {
"name": "foobar"
},
"links": {
"self": "/tags/1"
}
}
]
}
```
### Get a tag by id
```
GET /tags/1
Accept: application/vnd.api+json
```
```json
{
"data": {
"type": "tags",
"id": 1,
"attributes": {
"name": "foobar"
},
"links": {
"self": "/tags/1"
}
}
}
```
### Create a tag
```
POST /tags
Content-Type: application/vnd.api+json
Accept: application/vnd.api+json
{
"data": {
"type": "tags",
"name": "foobar"
}
}
```
```json
{
"data": {
"type": "tags",
"id": 1,
"attributes": {
"name": "foobar"
},
"links": {
"self": "/tags/1"
}
}
}
```
### Create a relationship between two papers
```
@ -236,6 +349,26 @@ Accept: application/vnd.api+json
Response is empty HTTP 204.
### Add a tag to a paper
```
POST /papers/1/relationships/tags
Content-Type: application/vnd.api+json
Accept: application/vnd.api+json
{
"data": [
{ "type": "tags", "id": "2" },
...
]
}
```
`id` is the id of the tag, which must have been created beforehand.
Response is empty HTTP 204.
### Delete a paper and associated relationships
```
@ -264,6 +397,26 @@ Accept: application/vnd.api+json
Response is empty HTTP 204.
### Delete a tag from a paper
```
DELETE /papers/1/relationships/tags
Content-Type: application/vnd.api+json
Accept: application/vnd.api+json
{
"data": [
{ "type": "tags", "id": "2" },
...
]
}
```
`id` is the id of the tag.
Response is empty HTTP 204.
## Associated library
`reference_fetcher` is a module you can use to:

View File

@ -4,7 +4,7 @@ This file contains the database schema in SQLAlchemy format.
import sqlite3
from sqlalchemy import event
from sqlalchemy import Column, ForeignKey, Integer, String
from sqlalchemy import Column, ForeignKey, Integer, String, Table
from sqlalchemy.engine import Engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship as sqlalchemy_relationship
@ -24,9 +24,9 @@ def set_sqlite_pragma(dbapi_connection, connection_record):
cursor.close()
class Association(Base):
class RelationshipAssociation(Base):
# Relationships are to be read "left RELATION right"
__tablename__ = "association"
__tablename__ = "relationship_association"
id = Column(Integer, primary_key=True)
left_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE"))
right_id = Column(Integer, ForeignKey("papers.id", ondelete="CASCADE"))
@ -45,6 +45,12 @@ class Association(Base):
back_populates="related_to",
passive_deletes=True)
# Plain association table linking papers and tags (many-to-many). It is used
# as the ``secondary`` table of the ``Paper.tags`` relationship. Both foreign
# keys are declared with ``ondelete="CASCADE"``.
# NOTE(review): no primary key or unique constraint is declared on
# (paper_id, tag_id), so the same tag can presumably be linked to the same
# paper more than once -- confirm whether this is intended.
tag_association_table = Table(
    'tag_association', Base.metadata,
    Column('paper_id', Integer, ForeignKey('papers.id', ondelete="CASCADE")),
    Column('tag_id', Integer, ForeignKey('tags.id', ondelete="CASCADE"))
)
class Paper(Base):
__tablename__ = "papers"
@ -52,15 +58,19 @@ class Paper(Base):
doi = Column(String(), nullable=True, unique=True)
arxiv_id = Column(String(25), nullable=True, unique=True)
# related_to are papers related to this paper (this_paper R …)
related_to = sqlalchemy_relationship("Association",
foreign_keys="Association.left_id",
related_to = sqlalchemy_relationship("RelationshipAssociation",
foreign_keys="RelationshipAssociation.left_id",
back_populates="left_paper",
passive_deletes=True)
# related_by are papers referenced by this paper (… R this_paper)
related_by = sqlalchemy_relationship("Association",
foreign_keys="Association.right_id",
related_by = sqlalchemy_relationship("RelationshipAssociation",
foreign_keys="RelationshipAssociation.right_id",
back_populates="right_paper",
passive_deletes=True)
# Tags relationship
tags = sqlalchemy_relationship("Tag",
secondary=tag_association_table,
backref="papers")
def __repr__(self):
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
@ -69,11 +79,27 @@ class Paper(Base):
self.arxiv_id,
)
def json_api_repr(self):
def json_api_repr(self, db):
"""
Dict to dump for the JSON API.
"""
relationships = [a.relationship.name for a in self.related_to]
relationships = [i.name for i in db.query(Relationship).all()]
relationships_dict = {
k: {
"links": {
"related": (
"/papers/%d/relationships/%s?reverse={reverse}" %
(self.id, k)
)
}
}
for k in relationships
}
relationships_dict["tags"] = {
"links": {
"related": "/papers/%d/relationships/tags" % (self.id,)
}
}
return {
"types": self.__tablename__,
"id": self.id,
@ -84,17 +110,7 @@ class Paper(Base):
"links": {
"self": "/papers/%d" % (self.id,)
},
"relationships": {
k: {
"links": {
"related": (
"/papers/%d/relationships/%s?reverse={reverse}" %
(self.id, k)
)
}
}
for k in relationships
}
"relationships": relationships_dict
}
@ -102,6 +118,27 @@ class Relationship(Base):
__tablename__ = "relationships"
id = Column(Integer, primary_key=True)
name = Column(String(), unique=True)
associations = sqlalchemy_relationship("Association",
associations = sqlalchemy_relationship("RelationshipAssociation",
back_populates="relationship",
passive_deletes=True)
class Tag(Base):
    """
    A named tag, stored in the ``tags`` table.
    """
    __tablename__ = "tags"
    id = Column(Integer, primary_key=True)
    # Tag names are unique across the table
    name = Column(String(), unique=True)

    def json_api_repr(self):
        """
        Dict to dump for the JSON API.

        :returns: A resource object with ``type``, ``id``, ``attributes`` \
                (the tag ``name``) and a ``self`` link to ``/tags/<id>``.
        """
        return {
            # The member is called "type" (singular), as shown in the
            # documented responses for the tags routes.
            "type": self.__tablename__,
            "id": self.id,
            "attributes": {
                "name": self.name,
            },
            "links": {
                "self": "/tags/%d" % (self.id,)
            }
        }

View File

@ -41,7 +41,7 @@ def index():
}))
app.get("/papers", callback=routes.get.fetch_papers)
app.get("/papers/<id:int>", callback=routes.get.fetch_by_id)
app.get("/papers/<id:int>", callback=routes.get.fetch_papers_by_id)
app.get("/papers/<id:int>/relationships/<name>",
callback=routes.get.fetch_relationship)
app.get("/papers/<id:int>/<name>",
@ -51,8 +51,12 @@ app.route("/papers/<id:int>", method="DELETE",
app.route("/papers/<id:int>/relationships/<name>", method="DELETE",
callback=routes.delete.delete_relationship)
app.get("/tags", callback=routes.get.fetch_tags)
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
app.post("/papers", callback=routes.post.create_paper)
app.post("/tags", callback=routes.post.create_tag)
app.post("/papers/<id:int>/relationships/<name>",
callback=routes.post.update_relationships)

View File

@ -73,13 +73,27 @@ def delete_relationship(id, name, db):
return bottle.HTTPError(403, "Forbidden")
# Delete all the requested relationships
for i in data:
relationship = (db.query(database.Association)
.filter_by(left_id=id, right_id=i["id"])
.filter(database.Relationship.name == name)
.first())
if relationship is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
db.delete(relationship)
if i["type"] == "tags":
# Handle tags separately
tag = db.query(database.Tag).filter_by(id=i["id"]).first()
paper = db.query(database.Paper).filter_by(id=id).first()
if paper is None or tag is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
try:
paper.tags.remove(tag)
except ValueError:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
db.flush()
else:
relationship = (db.query(database.RelationshipAssociation)
.filter_by(left_id=id, right_id=i["id"])
.filter(database.Relationship.name == name)
.first())
if relationship is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
db.delete(relationship)
# Return an empty 204 on success
return tools.APIResponse(status=204, body="")

View File

@ -56,14 +56,14 @@ def fetch_papers(db):
resources = db.query(database.Paper).filter_by(**filters).all()
if resources:
return tools.APIResponse(tools.pretty_json({
"data": [resource.json_api_repr() for resource in resources]
"data": [resource.json_api_repr(db) for resource in resources]
}))
return bottle.HTTPError(404, "Not found")
def fetch_by_id(id, db):
def fetch_papers_by_id(id, db):
"""
Fetch a resource identified by its internal id.
Fetch a paper identified by its internal id.
.. code-block:: bash
@ -102,7 +102,7 @@ def fetch_by_id(id, db):
resource = db.query(database.Paper).filter_by(id=id).first()
if resource:
return tools.APIResponse(tools.pretty_json({
"data": resource.json_api_repr()
"data": resource.json_api_repr(db)
}))
return bottle.HTTPError(404, "Not found")
@ -152,12 +152,111 @@ def fetch_relationship(id, name, db):
"data": [
]
}
if reversed:
relationships = resource.related_by
# Tags are handled differently
if name == "tags":
for t in resource.tags:
response["data"].append({
"type": name,
"id": t.id
})
else:
relationships = resource.related_to
for r in relationships:
if r.relationship.name == name:
response["data"].append({"type": name, "id": r.right_id})
if reversed:
relationships = resource.related_by
else:
relationships = resource.related_to
for r in relationships:
if r.relationship.name == name:
response["data"].append({"type": name, "id": r.right_id})
return tools.APIResponse(tools.pretty_json(response))
return bottle.HTTPError(404, "Not found")
def fetch_tags(db):
    """
    Fetch all matching tags.

    .. code-block:: bash

        GET /tags
        Accept: application/vnd.api+json

    Filtering is possible using ``id=ID``, ``name=NAME`` or any combination of
    these GET parameters. Other parameters are ignored.

    .. code-block:: json

        {
            "data": [
                {
                    "type": "tags",
                    "id": 1,
                    "attributes": {
                        "name": "foobar"
                    },
                    "links": {
                        "self": "/tags/1"
                    }
                }
            ]
        }

    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An ``HTTPResponse``. A 404 error is returned when no tag \
            matches the filters.
    """
    # Only keep the request parameters that are valid filters
    filters = {k: bottle.request.params[k]
               for k in bottle.request.params
               if k in ["id", "name"]}
    # The model class is ``Tag`` (singular)
    resources = db.query(database.Tag).filter_by(**filters).all()
    if resources:
        return tools.APIResponse(tools.pretty_json({
            "data": [resource.json_api_repr() for resource in resources]
        }))
    return bottle.HTTPError(404, "Not found")
def fetch_tags_by_id(id, db):
    """
    Fetch a tag identified by its internal id.

    .. code-block:: bash

        GET /tags/1
        Accept: application/vnd.api+json

    .. code-block:: json

        {
            "data": {
                "type": "tags",
                "id": 1,
                "attributes": {
                    "name": "foobar"
                },
                "links": {
                    "self": "/tags/1"
                }
            }
        }

    :param id: The id of the requested tag.
    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An ``HTTPResponse``. A 404 error is returned when no tag has \
            this id.
    """
    # The model class is ``Tag`` (singular)
    resource = db.query(database.Tag).filter_by(id=id).first()
    if resource:
        return tools.APIResponse(tools.pretty_json({
            "data": resource.json_api_repr()
        }))
    return bottle.HTTPError(404, "Not found")

View File

@ -12,7 +12,7 @@ from reference_fetcher import arxiv
def create_paper(db):
"""
Create a new resource identified by its DOI or arXiv eprint id.
Create a new paper identified by its DOI or arXiv eprint id.
.. code-block:: bash
@ -76,7 +76,7 @@ def create_paper(db):
# Return the resource
response = {
"data": paper.json_api_repr()
"data": paper.json_api_repr(db)
}
# Import "cite" relation
add_cite_relationship(paper, db)
@ -153,7 +153,6 @@ def add_cite_relationship(paper, db):
:param db: A database session
:returns: Nothing.
"""
# TODO: Known bug: too many levels of recursion!
# If paper is on arXiv
if paper.arxiv_id is not None:
# Get the cited DOIs
@ -169,7 +168,8 @@ def add_cite_relationship(paper, db):
# If paper does not exist in db, add it
right_paper = create_by_doi(doi, db)
# Update cite relationship for this paper, recursively
add_cite_relationship(right_paper, db)
# TODO: Known bug: too many levels of recursion!
# add_cite_relationship(right_paper, db)
# Update the relationships
update_relationship_backend(paper.id, right_paper.id, "cite", db)
# If paper is not on arXiv, nothing to do
@ -216,10 +216,21 @@ def update_relationships(id, name, db):
return bottle.HTTPError(403, "Forbidden")
# Update all the relationships
for i in data:
updated = update_relationship_backend(id, i["id"], name, db)
if updated is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
if i["type"] == "tags":
# Handle tags separately
tag = db.query(database.Tag).filter_by(id=i["id"]).first()
paper = db.query(database.Paper).filter_by(id=id).first()
if paper is None or tag is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
paper.tags.append(tag)
db.add(paper)
db.flush()
else:
updated = update_relationship_backend(id, i["id"], name, db)
if updated is None:
# An error occurred => 403
return bottle.HTTPError(403, "Forbidden")
# Return an empty 204 on success
return tools.APIResponse(status=204, body="")
@ -246,7 +257,7 @@ def update_relationship_backend(left_id, right_id, name, db):
db.add(relationship)
db.flush()
# Update the relationship
a = database.Association(relationship_id=relationship.id)
a = database.RelationshipAssociation(relationship_id=relationship.id)
a.right_paper = right_paper
left_paper.related_to.append(a)
try:
@ -257,3 +268,70 @@ def update_relationship_backend(left_id, right_id, name, db):
db.rollback()
return None
return left_paper
def create_tag(db):
    """
    Create a new tag.

    .. code-block:: bash

        POST /tags
        Content-Type: application/vnd.api+json
        Accept: application/vnd.api+json

        {
            "data": {
                "type": "tags",
                "name": "foobar"
            }
        }

    .. code-block:: json

        {
            "data": {
                "type": "tags",
                "id": 1,
                "attributes": {
                    "name": "foobar"
                },
                "links": {
                    "self": "/tags/1"
                }
            }
        }

    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An ``HTTPResponse``. A 403 error is returned when the request \
            body is invalid, a 409 error when the tag cannot be inserted \
            because of an integrity error (e.g. the name is already used).
    """
    try:
        data = json.loads(bottle.request.body.read().decode("utf-8"))
    except ValueError:
        # Body is not valid UTF-8 encoded JSON => 403, as for any other
        # invalid request
        return bottle.HTTPError(403, "Forbidden")
    # Validate the request: "data" must be an object of type "tags" with a
    # "name" member
    if (not isinstance(data, dict) or
            not isinstance(data.get("data"), dict) or
            data["data"].get("type") != "tags" or
            "name" not in data["data"]):
        return bottle.HTTPError(403, "Forbidden")
    data = data["data"]

    tag = database.Tag(name=data["name"])

    # Add it to the database
    try:
        db.add(tag)
        db.flush()
    except IntegrityError:
        # Unique constraint violation, tag already exists
        db.rollback()
        return bottle.HTTPError(409, "Conflict")

    # Return the resource
    response = {
        "data": tag.json_api_repr()
    }
    # Return 200 with the correct body
    headers = {"Location": "/tags/%d" % (tag.id,)}
    return tools.APIResponse(status=200,
                             body=tools.pretty_json(response),
                             headers=headers)