Add a queue to store papers waiting for citation processing

* Also add a delete endpoint to delete a tag.
* Various fixes in the code.
* Better doc.
This commit is contained in:
Lucas Verney 2015-12-26 18:01:30 +01:00
parent 4a67e974b8
commit ffe454a558
6 changed files with 146 additions and 25 deletions

View File

@ -379,6 +379,16 @@ Accept: application/vnd.api+json
Response is empty HTTP 204. Response is empty HTTP 204.
### Delete a tag
```
DELETE /tags/1
Accept: application/vnd.api+json
```
Response is empty HTTP 204.
### Delete a relationship between two papers ### Delete a relationship between two papers
``` ```

View File

@ -35,15 +35,12 @@ class RelationshipAssociation(Base):
ondelete="CASCADE")) ondelete="CASCADE"))
right_paper = sqlalchemy_relationship("Paper", right_paper = sqlalchemy_relationship("Paper",
foreign_keys=right_id, foreign_keys=right_id,
back_populates="related_by", back_populates="related_by")
passive_deletes=True) relationship = sqlalchemy_relationship("Relationship")
relationship = sqlalchemy_relationship("Relationship",
passive_deletes=True)
left_paper = sqlalchemy_relationship("Paper", left_paper = sqlalchemy_relationship("Paper",
foreign_keys=left_id, foreign_keys=left_id,
back_populates="related_to", back_populates="related_to")
passive_deletes=True)
tag_association_table = Table( tag_association_table = Table(
'tag_association', Base.metadata, 'tag_association', Base.metadata,
@ -56,7 +53,7 @@ class Paper(Base):
__tablename__ = "papers" __tablename__ = "papers"
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
doi = Column(String(), nullable=True, unique=True) doi = Column(String(), nullable=True, unique=True)
arxiv_id = Column(String(25), nullable=True, unique=True) arxiv_id = Column(String(30), nullable=True, unique=True)
# related_to are papers related to this paper (this_paper R …) # related_to are papers related to this paper (this_paper R …)
related_to = sqlalchemy_relationship("RelationshipAssociation", related_to = sqlalchemy_relationship("RelationshipAssociation",
foreign_keys="RelationshipAssociation.left_id", foreign_keys="RelationshipAssociation.left_id",
@ -70,7 +67,8 @@ class Paper(Base):
# Tags relationship # Tags relationship
tags = sqlalchemy_relationship("Tag", tags = sqlalchemy_relationship("Tag",
secondary=tag_association_table, secondary=tag_association_table,
backref="papers") backref="papers",
passive_deletes=True)
def __repr__(self): def __repr__(self):
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % ( return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
@ -142,3 +140,12 @@ class Tag(Base):
"self": "/tags/%d" % (self.id,) "self": "/tags/%d" % (self.id,)
} }
} }
# Declarative model for the queue of papers waiting for citation processing.
# Each row references one paper through ``paper_id`` / ``paper``.
class CitationProcessingQueue(Base):
# Name of the backing table.
__tablename__ = "citationprocessingqueue"
# Integer primary key of the queue entry.
id = Column(Integer, primary_key=True)
# Id of the queued paper. ``unique=True`` means a given paper can have at
# most one entry in the queue; the foreign key is declared with
# ``ON DELETE CASCADE``.
# NOTE(review): the cascade is only applied if the database actually enforces
# foreign keys -- confirm this is enabled for the configured engine.
paper_id = Column(Integer,
ForeignKey('papers.id', ondelete="CASCADE"),
unique=True)
# ORM relationship giving access to the queued ``Paper`` object.
paper = sqlalchemy_relationship("Paper")

12
main.py
View File

@ -2,6 +2,7 @@
import bottle import bottle
from bottle.ext import sqlalchemy from bottle.ext import sqlalchemy
from sqlalchemy import create_engine from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import config import config
import database import database
@ -10,6 +11,8 @@ import tools
# Initialize db and include the SQLAlchemy plugin in bottle # Initialize db and include the SQLAlchemy plugin in bottle
engine = create_engine('sqlite:///%s' % (config.database,), echo=True) engine = create_engine('sqlite:///%s' % (config.database,), echo=True)
create_session = sessionmaker(bind=engine)
database.Base.metadata.create_all(engine)
app = bottle.Bottle() app = bottle.Bottle()
plugin = sqlalchemy.Plugin( plugin = sqlalchemy.Plugin(
@ -21,13 +24,15 @@ plugin = sqlalchemy.Plugin(
keyword='db', keyword='db',
# If it is true, execute `metadata.create_all(engine)` when plugin is # If it is true, execute `metadata.create_all(engine)` when plugin is
# applied (default False). # applied (default False).
create=True, create=False,
# If it is true, plugin commit changes after route is executed (default # If it is true, plugin commit changes after route is executed (default
# True). # True).
commit=True, commit=True,
# If it is true and keyword is not defined, plugin uses **kwargs argument # If it is true and keyword is not defined, plugin uses **kwargs argument
# to inject session database (default False). # to inject session database (default False).
use_kwargs=False use_kwargs=False,
# Create session method
create_session=create_session
) )
app.install(plugin) app.install(plugin)
@ -53,6 +58,8 @@ app.route("/papers/<id:int>/relationships/<name>", method="DELETE",
app.get("/tags", callback=routes.get.fetch_tags) app.get("/tags", callback=routes.get.fetch_tags)
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id) app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
app.route("/tags/<id:int>", method="DELETE",
callback=routes.delete.delete_tag)
app.post("/papers", callback=routes.post.create_paper) app.post("/papers", callback=routes.post.create_paper)
@ -66,4 +73,5 @@ app.route("/papers/<id:int>/relationships/<name>", method="PATCH",
if __name__ == "__main__": if __name__ == "__main__":
routes.post.fetch_citations_in_queue(create_session)
app.run(host=config.host, port=config.port, debug=(not config.production)) app.run(host=config.host, port=config.port, debug=(not config.production))

View File

@ -97,3 +97,28 @@ def delete_relationship(id, name, db):
db.delete(relationship) db.delete(relationship)
# Return an empty 204 on success # Return an empty 204 on success
return tools.APIResponse(status=204, body="") return tools.APIResponse(status=204, body="")
def delete_tag(id, db):
    """
    Remove the tag identified by ``id``.

    Request:

    .. code-block:: bash

        DELETE /tags/1
        Accept: application/vnd.api+json

    Response on success:

    .. code-block:: bash

        HTTP 204

    :param id: The id of the tag to delete.
    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An empty ``204`` ``APIResponse`` when the tag was found and \
            marked for deletion, a ``404`` ``HTTPError`` otherwise.
    """
    tag = db.query(database.Tag).filter_by(id=id).first()
    # Guard clause: nothing to delete if no tag matches this id
    if not tag:
        return bottle.HTTPError(404, "Not found")
    db.delete(tag)
    # Return an empty 204 on success
    return tools.APIResponse(status=204, body="")

View File

@ -3,8 +3,10 @@ This file contains POST routes methods.
""" """
import bottle import bottle
import json import json
import threading
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
import config
import database import database
import tools import tools
from reference_fetcher import arxiv from reference_fetcher import arxiv
@ -115,19 +117,19 @@ def create_by_doi(doi, db):
return paper return paper
def create_by_arxiv(arxiv, db): def create_by_arxiv(arxiv_id, db):
""" """
Create a new resource identified by its arXiv eprint ID, if it does not Create a new resource identified by its arXiv eprint ID, if it does not
exist. exist.
:param arxiv: The arXiv eprint ID. :param arxiv_id: The arXiv eprint ID.
:param db: A database session. :param db: A database session.
:returns: ``None`` if insertion failed, the ``Paper`` object otherwise. :returns: ``None`` if insertion failed, the ``Paper`` object otherwise.
""" """
paper = database.Paper(arxiv_id=arxiv) paper = database.Paper(arxiv_id=arxiv_id)
# Try to fetch the DOI associated with this arXiv id # Try to fetch the DOI associated with this arXiv id
doi = arxiv.get_doi(arxiv) doi = arxiv.get_doi(arxiv_id)
if doi: if doi:
paper.doi = doi paper.doi = doi
@ -156,20 +158,35 @@ def add_cite_relationship(paper, db):
# If paper is on arXiv # If paper is on arXiv
if paper.arxiv_id is not None: if paper.arxiv_id is not None:
# Get the cited DOIs # Get the cited DOIs
cited_dois = arxiv.get_cited_dois(paper.arxiv_id) cited_urls = arxiv.get_cited_dois(paper.arxiv_id)
# Filter out the ones that were not matched # Filter out the ones that were not matched
cited_dois = [cited_dois[k] cited_urls = [cited_urls[k]
for k in cited_dois if cited_dois[k] is not None] for k in cited_urls if cited_urls[k] is not None]
for doi in cited_dois: for url in cited_urls:
type, identifier = tools.get_identifier_from_url(url)
if type is None:
# No identifier found
continue
# Get the associated paper in the db # Get the associated paper in the db
right_paper = (db.query(database.Paper). right_paper = (db.query(database.Paper)
filter_by(doi=doi).first()) .filter(getattr(database.Paper, type) == identifier)
.first())
if right_paper is None: if right_paper is None:
# If paper does not exist in db, add it # If paper is not in db, add it
right_paper = create_by_doi(doi, db) if type == "doi":
# Update cite relationship for this paper, recursively right_paper = create_by_doi(identifier, db)
# TODO: Known bug: too many levels of recursion! elif type == "arxiv_id":
# add_cite_relationship(right_paper, db) right_paper = create_by_arxiv(identifier, db)
else:
continue
# Push this paper on the queue for update of cite relationships
queue = database.CitationProcessingQueue()
queue.paper = right_paper
try:
db.add(queue)
except IntegrityError:
# Unique constraint violation, relationship already exists
db.rollback()
# Update the relationships # Update the relationships
update_relationship_backend(paper.id, right_paper.id, "cite", db) update_relationship_backend(paper.id, right_paper.id, "cite", db)
# If paper is not on arXiv, nothing to do # If paper is not on arXiv, nothing to do
@ -177,6 +194,39 @@ def add_cite_relationship(paper, db):
return return
def fetch_citations_in_queue(create_session):
    """
    Process the first item in the queue of papers waiting for citation
    processing.

    .. note::

        Schedules itself again through a ``threading.Timer`` after
        ``config.queue_polling_interval``, so the queue is polled
        periodically and at most one queued paper is handled per run.

    :param create_session: a ``SQLAlchemy`` ``sessionmaker``.
    :returns: Nothing.
    """
    # Every run gets its own session; it has to be released once we are done,
    # otherwise each polling round would leave a session (and its
    # connection) behind.
    db = create_session()
    try:
        queued = db.query(database.CitationProcessingQueue).first()
        if queued:
            print("Processing citation relationships for %s." %
                  (queued.paper,))
            # Process this paper
            add_cite_relationship(queued.paper, db)
            # Remove this paper from queue
            db.delete(queued)
        # Commit to the database
        try:
            db.commit()
        except Exception:
            # Only swallow regular errors: KeyboardInterrupt / SystemExit must
            # still be able to stop the program.
            db.rollback()
    finally:
        db.close()
    # Call this function again after a while
    threading.Timer(
        config.queue_polling_interval,
        fetch_citations_in_queue,
        args=(create_session,)
    ).start()
def update_relationships(id, name, db): def update_relationships(id, name, db):
""" """
Update the relationships associated to a given paper. Update the relationships associated to a given paper.

View File

@ -18,6 +18,27 @@ def pretty_json(data):
separators=(',', ': ')) separators=(',', ': '))
def get_identifier_from_url(url):
    """
    Get the identifier out of a DOI or arXiv URL.

    The identifier is everything that follows ``dx.doi.org/`` (DOI) or
    ``arxiv.org/abs/`` (arXiv) in the URL. DOI is tried first.

    :param url: An input URL.
    :returns: A tuple ``(type, identifier)``, where ``type`` is either \
            ``"doi"`` or ``"arxiv_id"``. Returns ``(None, None)`` if the \
            URL could not be matched or carries no identifier.
    """
    # The trailing slash is part of each marker, so the test and the split
    # always agree and the identifier never starts at a wrong offset.
    markers = (
        ("dx.doi.org/", "doi"),
        ("arxiv.org/abs/", "arxiv_id"),
    )
    for marker, kind in markers:
        _, matched, identifier = url.partition(marker)
        # An empty identifier (URL ending right after the marker) is not
        # a usable match.
        if matched and identifier:
            return (kind, identifier)
    return (None, None)
class APIResponse(bottle.HTTPResponse): class APIResponse(bottle.HTTPResponse):
""" """
Extend bottle.HTTPResponse base class to add Content-Type header. Extend bottle.HTTPResponse base class to add Content-Type header.