Add a queue to store papers waiting for citation processing

* Also add a delete endpoint to delete a tag.
* Various fixes in the code.
* Better doc.
This commit is contained in:
Lucas Verney 2015-12-26 18:01:30 +01:00
parent 4a67e974b8
commit ffe454a558
6 changed files with 146 additions and 25 deletions

View File

@ -379,6 +379,16 @@ Accept: application/vnd.api+json
Response is empty HTTP 204.
### Delete a tag
```
DELETE /tags/1
Accept: application/vnd.api+json
```
Response is empty HTTP 204.
### Delete a relationship between two papers
```

View File

@ -35,15 +35,12 @@ class RelationshipAssociation(Base):
ondelete="CASCADE"))
right_paper = sqlalchemy_relationship("Paper",
foreign_keys=right_id,
back_populates="related_by",
passive_deletes=True)
relationship = sqlalchemy_relationship("Relationship",
passive_deletes=True)
back_populates="related_by")
relationship = sqlalchemy_relationship("Relationship")
left_paper = sqlalchemy_relationship("Paper",
foreign_keys=left_id,
back_populates="related_to",
passive_deletes=True)
back_populates="related_to")
tag_association_table = Table(
'tag_association', Base.metadata,
@ -56,7 +53,7 @@ class Paper(Base):
__tablename__ = "papers"
id = Column(Integer, primary_key=True)
doi = Column(String(), nullable=True, unique=True)
arxiv_id = Column(String(25), nullable=True, unique=True)
arxiv_id = Column(String(30), nullable=True, unique=True)
# related_to are papers related to this paper (this_paper R …)
related_to = sqlalchemy_relationship("RelationshipAssociation",
foreign_keys="RelationshipAssociation.left_id",
@ -70,7 +67,8 @@ class Paper(Base):
# Tags relationship
tags = sqlalchemy_relationship("Tag",
secondary=tag_association_table,
backref="papers")
backref="papers",
passive_deletes=True)
def __repr__(self):
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
@ -142,3 +140,12 @@ class Tag(Base):
"self": "/tags/%d" % (self.id,)
}
}
class CitationProcessingQueue(Base):
    """
    Queue of papers waiting for their citation relationships to be processed.

    Each row references a single ``Paper`` through ``paper_id``.
    """
    __tablename__ = "citationprocessingqueue"
    id = Column(Integer, primary_key=True)
    # ``unique=True``: a given paper can be present at most once in the queue.
    # The foreign key is declared with ``ON DELETE CASCADE`` on ``papers.id``.
    # NOTE(review): with SQLite, cascades only apply if foreign key
    # enforcement is enabled on the connection -- TODO confirm.
    paper_id = Column(Integer,
                      ForeignKey('papers.id', ondelete="CASCADE"),
                      unique=True)
    # The queued ``Paper`` object.
    paper = sqlalchemy_relationship("Paper")

12
main.py
View File

@ -2,6 +2,7 @@
import bottle
from bottle.ext import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import config
import database
@ -10,6 +11,8 @@ import tools
# Initialize db and include the SQLAlchemy plugin in bottle
engine = create_engine('sqlite:///%s' % (config.database,), echo=True)
create_session = sessionmaker(bind=engine)
database.Base.metadata.create_all(engine)
app = bottle.Bottle()
plugin = sqlalchemy.Plugin(
@ -21,13 +24,15 @@ plugin = sqlalchemy.Plugin(
keyword='db',
# If it is true, execute `metadata.create_all(engine)` when plugin is
# applied (default False).
create=True,
create=False,
# If it is true, plugin commit changes after route is executed (default
# True).
commit=True,
# If it is true and keyword is not defined, plugin uses **kwargs argument
# to inject session database (default False).
use_kwargs=False
use_kwargs=False,
# Create session method
create_session=create_session
)
app.install(plugin)
@ -53,6 +58,8 @@ app.route("/papers/<id:int>/relationships/<name>", method="DELETE",
app.get("/tags", callback=routes.get.fetch_tags)
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
app.route("/tags/<id:int>", method="DELETE",
callback=routes.delete.delete_tag)
app.post("/papers", callback=routes.post.create_paper)
@ -66,4 +73,5 @@ app.route("/papers/<id:int>/relationships/<name>", method="PATCH",
if __name__ == "__main__":
    # Start processing the citation queue first: this call handles one queued
    # paper and re-schedules itself with a timer, using sessions created from
    # ``create_session`` rather than the per-request session of the plugin.
    routes.post.fetch_citations_in_queue(create_session)
    # Then start the Bottle application.
    app.run(host=config.host, port=config.port, debug=(not config.production))

View File

@ -97,3 +97,28 @@ def delete_relationship(id, name, db):
db.delete(relationship)
# Return an empty 204 on success
return tools.APIResponse(status=204, body="")
def delete_tag(id, db):
    """
    Remove the tag identified by ``id`` from the database.

    Example request:

    .. code-block:: bash

        DELETE /tags/1
        Accept: application/vnd.api+json

    Example response on success (empty body):

    .. code-block:: bash

        HTTP 204

    :param id: The id of the tag to delete.
    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An empty 204 ``APIResponse`` if the tag was found and marked \
            for deletion, a 404 ``HTTPError`` otherwise.
    """
    tag = db.query(database.Tag).filter_by(id=id).first()
    # Guard clause: nothing to delete if no tag matches this id
    if not tag:
        return bottle.HTTPError(404, "Not found")
    db.delete(tag)
    # Return an empty 204 on success
    return tools.APIResponse(status=204, body="")

View File

@ -3,8 +3,10 @@ This file contains POST routes methods.
"""
import bottle
import json
import threading
from sqlalchemy.exc import IntegrityError
import config
import database
import tools
from reference_fetcher import arxiv
@ -115,19 +117,19 @@ def create_by_doi(doi, db):
return paper
def create_by_arxiv(arxiv, db):
def create_by_arxiv(arxiv_id, db):
"""
Create a new resource identified by its arXiv eprint ID, if it does not
exist.
:param arxiv: The arXiv eprint ID.
:param arxiv_id: The arXiv eprint ID.
:param db: A database session.
:returns: ``None`` if insertion failed, the ``Paper`` object otherwise.
"""
paper = database.Paper(arxiv_id=arxiv)
paper = database.Paper(arxiv_id=arxiv_id)
# Try to fetch an arXiv id
doi = arxiv.get_doi(arxiv)
doi = arxiv.get_doi(arxiv_id)
if doi:
paper.doi = doi
@ -156,20 +158,35 @@ def add_cite_relationship(paper, db):
# If paper is on arXiv
if paper.arxiv_id is not None:
# Get the cited DOIs
cited_dois = arxiv.get_cited_dois(paper.arxiv_id)
cited_urls = arxiv.get_cited_dois(paper.arxiv_id)
# Filter out the ones that were not matched
cited_dois = [cited_dois[k]
for k in cited_dois if cited_dois[k] is not None]
for doi in cited_dois:
cited_urls = [cited_urls[k]
for k in cited_urls if cited_urls[k] is not None]
for url in cited_urls:
type, identifier = tools.get_identifier_from_url(url)
if type is None:
# No identifier found
continue
# Get the associated paper in the db
right_paper = (db.query(database.Paper).
filter_by(doi=doi).first())
right_paper = (db.query(database.Paper)
.filter(getattr(database.Paper, type) == identifier)
.first())
if right_paper is None:
# If paper does not exist in db, add it
right_paper = create_by_doi(doi, db)
# Update cite relationship for this paper, recursively
# TODO: Known bug: too many levels of recursion!
# add_cite_relationship(right_paper, db)
# If paper is not in db, add it
if type == "doi":
right_paper = create_by_doi(identifier, db)
elif type == "arxiv_id":
right_paper = create_by_arxiv(identifier, db)
else:
continue
# Push this paper on the queue for update of cite relationships
queue = database.CitationProcessingQueue()
queue.paper = right_paper
try:
db.add(queue)
except IntegrityError:
# Unique constraint violation, relationship already exists
db.rollback()
# Update the relationships
update_relationship_backend(paper.id, right_paper.id, "cite", db)
# If paper is not on arXiv, nothing to do
@ -177,6 +194,39 @@ def add_cite_relationship(paper, db):
return
def fetch_citations_in_queue(create_session):
    """
    Process the first item in the queue of papers waiting for citation
    processing, then schedule the next run.

    .. note::

        Calls itself again, through a ``threading.Timer``, after the interval
        defined by ``config.queue_polling_interval``, so that queued papers
        are processed in the background, one per run.

    :param create_session: a ``SQLAlchemy`` ``sessionmaker``.
    :returns: Nothing.
    """
    # Get a db Session dedicated to this run
    db = create_session()
    try:
        queued = db.query(database.CitationProcessingQueue).first()
        if queued:
            print("Processing citation relationships for %s." %
                  (queued.paper,))
            # Process this paper
            add_cite_relationship(queued.paper, db)
            # Remove this paper from queue
            db.delete(queued)
        # Commit to the database
        db.commit()
    except Exception as exc:
        # Only ``Exception`` is caught, so that KeyboardInterrupt / SystemExit
        # still propagate. Any failure (processing or commit) is rolled back
        # and must not stop the polling.
        # NOTE: after a rollback the item is still in the queue, so it will
        # be retried at the next run.
        db.rollback()
        print("Citation processing failed: %s" % (exc,))
    finally:
        # Always release the session, a new one is created at every run
        db.close()
    # Call this function again after a while
    threading.Timer(
        config.queue_polling_interval,
        fetch_citations_in_queue,
        args=(create_session,)
    ).start()
def update_relationships(id, name, db):
"""
Update the relationships associated to a given paper.

View File

@ -18,6 +18,27 @@ def pretty_json(data):
separators=(',', ': '))
def get_identifier_from_url(url):
    """
    Get the identifier out of a DOI or arXiv URL.

    The identifier is everything following ``dx.doi.org/`` (DOI) or
    ``arxiv.org/abs/`` (arXiv eprint ID) in the URL. DOI URLs are checked
    first.

    :param url: An input URL (``None`` or an empty string are accepted).
    :returns: A tuple ``(type, identifier)`` where ``type`` is either \
            ``"doi"`` or ``"arxiv_id"``. Returns ``(None, None)`` if could \
            not match, or if nothing follows the matched prefix.
    """
    if not url:
        return (None, None)

    # Ordered (type, marker) pairs. Each marker includes its trailing slash,
    # so the identifier is exactly what follows it and no hard-coded offset
    # is needed.
    markers = (
        ("doi", "dx.doi.org/"),
        ("arxiv_id", "arxiv.org/abs/"),
    )
    for kind, marker in markers:
        _, found, identifier = url.partition(marker)
        # Require a non-empty identifier: an empty one is not usable to
        # look up or create a paper.
        if found and identifier:
            return (kind, identifier)
    return (None, None)
class APIResponse(bottle.HTTPResponse):
"""
Extend bottle.HTTPResponse base class to add Content-Type header.