From ffe454a558f6c05feb07348406cda7e915e64bb0 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Sat, 26 Dec 2015 18:01:30 +0100 Subject: [PATCH] Add a queue to store papers waiting for citation processing * Also add a delete endpoint to delete a tag. * Various fixes in the code. * Better doc. --- README.md | 10 ++++++ database.py | 23 +++++++++----- main.py | 12 ++++++-- routes/delete.py | 25 +++++++++++++++ routes/post.py | 80 +++++++++++++++++++++++++++++++++++++++--------- tools.py | 21 +++++++++++++ 6 files changed, 146 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index baea074..4cf0556 100644 --- a/README.md +++ b/README.md @@ -379,6 +379,16 @@ Accept: application/vnd.api+json Response is empty HTTP 204. +### Delete a tag + +``` +DELETE /tags/1 +Accept: application/vnd.api+json +``` + +Response is empty HTTP 204. + + ### Delete a relationship between two papers ``` diff --git a/database.py b/database.py index 8078c6c..cc4fc17 100644 --- a/database.py +++ b/database.py @@ -35,15 +35,12 @@ class RelationshipAssociation(Base): ondelete="CASCADE")) right_paper = sqlalchemy_relationship("Paper", foreign_keys=right_id, - back_populates="related_by", - passive_deletes=True) - relationship = sqlalchemy_relationship("Relationship", - passive_deletes=True) + back_populates="related_by") + relationship = sqlalchemy_relationship("Relationship") left_paper = sqlalchemy_relationship("Paper", foreign_keys=left_id, - back_populates="related_to", - passive_deletes=True) + back_populates="related_to") tag_association_table = Table( 'tag_association', Base.metadata, @@ -56,7 +53,7 @@ class Paper(Base): __tablename__ = "papers" id = Column(Integer, primary_key=True) doi = Column(String(), nullable=True, unique=True) - arxiv_id = Column(String(25), nullable=True, unique=True) + arxiv_id = Column(String(30), nullable=True, unique=True) # related_to are papers related to this paper (this_paper R …) related_to = 
sqlalchemy_relationship("RelationshipAssociation", foreign_keys="RelationshipAssociation.left_id", @@ -70,7 +67,8 @@ class Paper(Base): # Tags relationship tags = sqlalchemy_relationship("Tag", secondary=tag_association_table, - backref="papers") + backref="papers", + passive_deletes=True) def __repr__(self): return "" % ( @@ -142,3 +140,12 @@ class Tag(Base): "self": "/tags/%d" % (self.id,) } } + + +class CitationProcessingQueue(Base): + __tablename__ = "citationprocessingqueue" + id = Column(Integer, primary_key=True) + paper_id = Column(Integer, + ForeignKey('papers.id', ondelete="CASCADE"), + unique=True) + paper = sqlalchemy_relationship("Paper") diff --git a/main.py b/main.py index 7a8023c..a540f98 100755 --- a/main.py +++ b/main.py @@ -2,6 +2,7 @@ import bottle from bottle.ext import sqlalchemy from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker import config import database @@ -10,6 +11,8 @@ import tools # Initialize db and include the SQLAlchemy plugin in bottle engine = create_engine('sqlite:///%s' % (config.database,), echo=True) +create_session = sessionmaker(bind=engine) +database.Base.metadata.create_all(engine) app = bottle.Bottle() plugin = sqlalchemy.Plugin( @@ -21,13 +24,15 @@ plugin = sqlalchemy.Plugin( keyword='db', # If it is true, execute `metadata.create_all(engine)` when plugin is # applied (default False). - create=True, + create=False, # If it is true, plugin commit changes after route is executed (default # True). commit=True, # If it is true and keyword is not defined, plugin uses **kwargs argument # to inject session database (default False). 
- use_kwargs=False + use_kwargs=False, + # Create session method + create_session=create_session ) app.install(plugin) @@ -53,6 +58,8 @@ app.route("/papers//relationships/", method="DELETE", app.get("/tags", callback=routes.get.fetch_tags) app.get("/tags/", callback=routes.get.fetch_tags_by_id) +app.route("/tags/", method="DELETE", + callback=routes.delete.delete_tag) app.post("/papers", callback=routes.post.create_paper) @@ -66,4 +73,5 @@ app.route("/papers//relationships/", method="PATCH", if __name__ == "__main__": + routes.post.fetch_citations_in_queue(create_session) app.run(host=config.host, port=config.port, debug=(not config.production)) diff --git a/routes/delete.py b/routes/delete.py index d9b2c8e..3edff93 100644 --- a/routes/delete.py +++ b/routes/delete.py @@ -97,3 +97,28 @@ def delete_relationship(id, name, db): db.delete(relationship) # Return an empty 204 on success return tools.APIResponse(status=204, body="") + + +def delete_tag(id, db): + """ + Delete a given tag. + + .. code-block:: bash + + DELETE /tags/1 + Accept: application/vnd.api+json + + + .. code-block:: bash + + HTTP 204 + + :param id: The id of the requested tag to be deleted. + :param db: A database session, injected by the ``Bottle`` plugin. + :returns: An ``HTTPResponse``. + """ + resource = db.query(database.Tag).filter_by(id=id).first() + if resource: + db.delete(resource) + return tools.APIResponse(status=204, body="") + return bottle.HTTPError(404, "Not found") diff --git a/routes/post.py b/routes/post.py index 96f4988..420e60e 100644 --- a/routes/post.py +++ b/routes/post.py @@ -3,8 +3,10 @@ This file contains POST routes methods. 
""" import bottle import json +import threading from sqlalchemy.exc import IntegrityError +import config import database import tools from reference_fetcher import arxiv @@ -115,19 +117,19 @@ def create_by_doi(doi, db): return paper -def create_by_arxiv(arxiv, db): +def create_by_arxiv(arxiv_id, db): """ Create a new resource identified by its arXiv eprint ID, if it does not exist. - :param arxiv: The arXiv eprint ID. + :param arxiv_id: The arXiv eprint ID. :param db: A database session. :returns: ``None`` if insertion failed, the ``Paper`` object otherwise. """ - paper = database.Paper(arxiv_id=arxiv) + paper = database.Paper(arxiv_id=arxiv_id) # Try to fetch an arXiv id - doi = arxiv.get_doi(arxiv) + doi = arxiv.get_doi(arxiv_id) if doi: paper.doi = doi @@ -156,20 +158,35 @@ def add_cite_relationship(paper, db): # If paper is on arXiv if paper.arxiv_id is not None: # Get the cited DOIs - cited_dois = arxiv.get_cited_dois(paper.arxiv_id) + cited_urls = arxiv.get_cited_dois(paper.arxiv_id) # Filter out the ones that were not matched - cited_dois = [cited_dois[k] - for k in cited_dois if cited_dois[k] is not None] - for doi in cited_dois: + cited_urls = [cited_urls[k] + for k in cited_urls if cited_urls[k] is not None] + for url in cited_urls: + type, identifier = tools.get_identifier_from_url(url) + if type is None: + # No identifier found + continue # Get the associated paper in the db - right_paper = (db.query(database.Paper). - filter_by(doi=doi).first()) + right_paper = (db.query(database.Paper) + .filter(getattr(database.Paper, type) == identifier) + .first()) if right_paper is None: - # If paper does not exist in db, add it - right_paper = create_by_doi(doi, db) - # Update cite relationship for this paper, recursively - # TODO: Known bug: too many levels of recursion! 
- # add_cite_relationship(right_paper, db) + # If paper is not in db, add it + if type == "doi": + right_paper = create_by_doi(identifier, db) + elif type == "arxiv_id": + right_paper = create_by_arxiv(identifier, db) + else: + continue + # Push this paper on the queue for update of cite relationships + queue = database.CitationProcessingQueue() + queue.paper = right_paper + try: + db.add(queue) + except IntegrityError: + # Unique constraint violation, paper is already in the queue + db.rollback() # Update the relationships update_relationship_backend(paper.id, right_paper.id, "cite", db) # If paper is not on arXiv, nothing to do @@ -177,6 +194,39 @@ def add_cite_relationship(paper, db): return +def fetch_citations_in_queue(create_session): + """ + Process the first item in the queue, waiting for citation processing. + + .. note:: + + Calls itself recursively after the time defined in ``config``, so + that queued articles are processed concurrently. + + :param create_session: a ``SQLAlchemy`` ``sessionmaker``. + :returns: Nothing. + """ + # Get a db Session + db = create_session() + queued = db.query(database.CitationProcessingQueue).first() + if queued: + print("Processing citation relationships for %s." % (queued.paper,)) + # Process this paper + add_cite_relationship(queued.paper, db) + # Remove this paper from queue + db.delete(queued) + # Commit to the database + try: + db.commit() + except: + db.rollback() + # Call this function again after a while + threading.Timer( + config.queue_polling_interval, + lambda: fetch_citations_in_queue(create_session) + ).start() + + def update_relationships(id, name, db): """ Update the relationships associated to a given paper. diff --git a/tools.py b/tools.py index 3221592..117834a 100644 --- a/tools.py +++ b/tools.py @@ -18,6 +18,27 @@ def pretty_json(data): separators=(',', ': ')) +def get_identifier_from_url(url): + """ + Get the identifier out of a DOI or arXiv URL. + + :param url: An input URL. 
+ :returns: A tuple ``(type, identifier)``. Returns ``(None, None)`` if \ + could not match. + """ + type = None + identifier = None + + if "dx.doi.org" in url: + type = "doi" + identifier = url[url.find("dx.doi.org") + 11:] + elif "arxiv.org/abs" in url: + type = "arxiv_id" + identifier = url[url.find("arxiv.org/abs/") + 14:] + + return (type, identifier) + + class APIResponse(bottle.HTTPResponse): """ Extend bottle.HTTPResponse base class to add Content-Type header.