Add a queue to store papers waiting for citation processing
* Also add a delete endpoint to delete a tag. * Various fixes in the code. * Better doc.
This commit is contained in:
parent
4a67e974b8
commit
ffe454a558
10
README.md
10
README.md
@ -379,6 +379,16 @@ Accept: application/vnd.api+json
|
||||
Response is empty HTTP 204.
|
||||
|
||||
|
||||
### Delete a tag
|
||||
|
||||
```
|
||||
DELETE /tags/1
|
||||
Accept: application/vnd.api+json
|
||||
```
|
||||
|
||||
Response is empty HTTP 204.
|
||||
|
||||
|
||||
### Delete a relationship between two papers
|
||||
|
||||
```
|
||||
|
23
database.py
23
database.py
@ -35,15 +35,12 @@ class RelationshipAssociation(Base):
|
||||
ondelete="CASCADE"))
|
||||
right_paper = sqlalchemy_relationship("Paper",
|
||||
foreign_keys=right_id,
|
||||
back_populates="related_by",
|
||||
passive_deletes=True)
|
||||
relationship = sqlalchemy_relationship("Relationship",
|
||||
passive_deletes=True)
|
||||
back_populates="related_by")
|
||||
relationship = sqlalchemy_relationship("Relationship")
|
||||
|
||||
left_paper = sqlalchemy_relationship("Paper",
|
||||
foreign_keys=left_id,
|
||||
back_populates="related_to",
|
||||
passive_deletes=True)
|
||||
back_populates="related_to")
|
||||
|
||||
tag_association_table = Table(
|
||||
'tag_association', Base.metadata,
|
||||
@ -56,7 +53,7 @@ class Paper(Base):
|
||||
__tablename__ = "papers"
|
||||
id = Column(Integer, primary_key=True)
|
||||
doi = Column(String(), nullable=True, unique=True)
|
||||
arxiv_id = Column(String(25), nullable=True, unique=True)
|
||||
arxiv_id = Column(String(30), nullable=True, unique=True)
|
||||
# related_to are papers related to this paper (this_paper R …)
|
||||
related_to = sqlalchemy_relationship("RelationshipAssociation",
|
||||
foreign_keys="RelationshipAssociation.left_id",
|
||||
@ -70,7 +67,8 @@ class Paper(Base):
|
||||
# Tags relationship
|
||||
tags = sqlalchemy_relationship("Tag",
|
||||
secondary=tag_association_table,
|
||||
backref="papers")
|
||||
backref="papers",
|
||||
passive_deletes=True)
|
||||
|
||||
def __repr__(self):
|
||||
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
|
||||
@ -142,3 +140,12 @@ class Tag(Base):
|
||||
"self": "/tags/%d" % (self.id,)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class CitationProcessingQueue(Base):
    """
    Queue of papers waiting for their citation relationships to be processed.

    Each row references a single ``Paper``.
    """
    __tablename__ = "citationprocessingqueue"

    id = Column(Integer, primary_key=True)
    # ``unique=True`` means a given paper can be queued at most once. The
    # foreign key is declared with ``ON DELETE CASCADE``.
    paper_id = Column(
        Integer,
        ForeignKey('papers.id', ondelete="CASCADE"),
        unique=True
    )
    # The queued ``Paper`` object, resolved through ``paper_id``.
    paper = sqlalchemy_relationship("Paper")
|
||||
|
12
main.py
12
main.py
@ -2,6 +2,7 @@
|
||||
import bottle
|
||||
from bottle.ext import sqlalchemy
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
import config
|
||||
import database
|
||||
@ -10,6 +11,8 @@ import tools
|
||||
|
||||
# Initialize db and include the SQLAlchemy plugin in bottle
|
||||
engine = create_engine('sqlite:///%s' % (config.database,), echo=True)
|
||||
create_session = sessionmaker(bind=engine)
|
||||
database.Base.metadata.create_all(engine)
|
||||
|
||||
app = bottle.Bottle()
|
||||
plugin = sqlalchemy.Plugin(
|
||||
@ -21,13 +24,15 @@ plugin = sqlalchemy.Plugin(
|
||||
keyword='db',
|
||||
# If it is true, execute `metadata.create_all(engine)` when plugin is
|
||||
# applied (default False).
|
||||
create=True,
|
||||
create=False,
|
||||
# If it is true, plugin commit changes after route is executed (default
|
||||
# True).
|
||||
commit=True,
|
||||
# If it is true and keyword is not defined, plugin uses **kwargs argument
|
||||
# to inject session database (default False).
|
||||
use_kwargs=False
|
||||
use_kwargs=False,
|
||||
# Create session method
|
||||
create_session=create_session
|
||||
)
|
||||
|
||||
app.install(plugin)
|
||||
@ -53,6 +58,8 @@ app.route("/papers/<id:int>/relationships/<name>", method="DELETE",
|
||||
|
||||
app.get("/tags", callback=routes.get.fetch_tags)
|
||||
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
|
||||
app.route("/tags/<id:int>", method="DELETE",
|
||||
callback=routes.delete.delete_tag)
|
||||
|
||||
|
||||
app.post("/papers", callback=routes.post.create_paper)
|
||||
@ -66,4 +73,5 @@ app.route("/papers/<id:int>/relationships/<name>", method="PATCH",
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
routes.post.fetch_citations_in_queue(create_session)
|
||||
app.run(host=config.host, port=config.port, debug=(not config.production))
|
||||
|
@ -97,3 +97,28 @@ def delete_relationship(id, name, db):
|
||||
db.delete(relationship)
|
||||
# Return an empty 204 on success
|
||||
return tools.APIResponse(status=204, body="")
|
||||
|
||||
|
||||
def delete_tag(id, db):
    """
    Remove a single tag, looked up by its id.

    .. code-block:: bash

        DELETE /tags/1
        Accept: application/vnd.api+json


    .. code-block:: bash

        HTTP 204

    :param id: The id of the tag to delete.
    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An empty ``204`` response on success, an ``HTTPError`` ``404`` \
            if no tag has this id.
    """
    tag = db.query(database.Tag).filter_by(id=id).first()
    if not tag:
        # Nothing to delete
        return bottle.HTTPError(404, "Not found")
    db.delete(tag)
    # Return an empty 204 on success
    return tools.APIResponse(status=204, body="")
|
||||
|
@ -3,8 +3,10 @@ This file contains POST routes methods.
|
||||
"""
|
||||
import bottle
|
||||
import json
|
||||
import threading
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
|
||||
import config
|
||||
import database
|
||||
import tools
|
||||
from reference_fetcher import arxiv
|
||||
@ -115,19 +117,19 @@ def create_by_doi(doi, db):
|
||||
return paper
|
||||
|
||||
|
||||
def create_by_arxiv(arxiv, db):
|
||||
def create_by_arxiv(arxiv_id, db):
|
||||
"""
|
||||
Create a new resource identified by its arXiv eprint ID, if it does not
|
||||
exist.
|
||||
|
||||
:param arxiv: The arXiv eprint ID.
|
||||
:param arxiv_id: The arXiv eprint ID.
|
||||
:param db: A database session.
|
||||
:returns: ``None`` if insertion failed, the ``Paper`` object otherwise.
|
||||
"""
|
||||
paper = database.Paper(arxiv_id=arxiv)
|
||||
paper = database.Paper(arxiv_id=arxiv_id)
|
||||
|
||||
# Try to fetch the DOI associated with this arXiv id
|
||||
doi = arxiv.get_doi(arxiv)
|
||||
doi = arxiv.get_doi(arxiv_id)
|
||||
if doi:
|
||||
paper.doi = doi
|
||||
|
||||
@ -156,20 +158,35 @@ def add_cite_relationship(paper, db):
|
||||
# If paper is on arXiv
|
||||
if paper.arxiv_id is not None:
|
||||
# Get the URLs (DOI or arXiv) of the cited papers
|
||||
cited_dois = arxiv.get_cited_dois(paper.arxiv_id)
|
||||
cited_urls = arxiv.get_cited_dois(paper.arxiv_id)
|
||||
# Filter out the ones that were not matched
|
||||
cited_dois = [cited_dois[k]
|
||||
for k in cited_dois if cited_dois[k] is not None]
|
||||
for doi in cited_dois:
|
||||
cited_urls = [cited_urls[k]
|
||||
for k in cited_urls if cited_urls[k] is not None]
|
||||
for url in cited_urls:
|
||||
type, identifier = tools.get_identifier_from_url(url)
|
||||
if type is None:
|
||||
# No identifier found
|
||||
continue
|
||||
# Get the associated paper in the db
|
||||
right_paper = (db.query(database.Paper).
|
||||
filter_by(doi=doi).first())
|
||||
right_paper = (db.query(database.Paper)
|
||||
.filter(getattr(database.Paper, type) == identifier)
|
||||
.first())
|
||||
if right_paper is None:
|
||||
# If paper does not exist in db, add it
|
||||
right_paper = create_by_doi(doi, db)
|
||||
# Update cite relationship for this paper, recursively
|
||||
# TODO: Known bug: too many levels of recursion!
|
||||
# add_cite_relationship(right_paper, db)
|
||||
# If paper is not in db, add it
|
||||
if type == "doi":
|
||||
right_paper = create_by_doi(identifier, db)
|
||||
elif type == "arxiv_id":
|
||||
right_paper = create_by_arxiv(identifier, db)
|
||||
else:
|
||||
continue
|
||||
# Push this paper on the queue for update of cite relationships
|
||||
queue = database.CitationProcessingQueue()
|
||||
queue.paper = right_paper
|
||||
try:
|
||||
db.add(queue)
|
||||
except IntegrityError:
|
||||
# Unique constraint violation, paper is already in the queue
|
||||
db.rollback()
|
||||
# Update the relationships
|
||||
update_relationship_backend(paper.id, right_paper.id, "cite", db)
|
||||
# If paper is not on arXiv, nothing to do
|
||||
@ -177,6 +194,39 @@ def add_cite_relationship(paper, db):
|
||||
return
|
||||
|
||||
|
||||
def fetch_citations_in_queue(create_session):
    """
    Process the first item in the queue, waiting for citation processing.

    .. note::

        Reschedules itself through a ``threading.Timer`` after
        ``config.queue_polling_interval``, so that queued articles are
        processed one at a time, in the background.

    .. note::

        If processing an item fails, the transaction is rolled back and the
        item stays in the queue; it will be retried on the next run.

    :param create_session: a ``SQLAlchemy`` ``sessionmaker``.
    :returns: Nothing.
    """
    # Get a db Session, dedicated to this run
    db = create_session()
    try:
        queued = db.query(database.CitationProcessingQueue).first()
        if queued:
            print("Processing citation relationships for %s." %
                  (queued.paper,))
            # Process this paper
            add_cite_relationship(queued.paper, db)
            # Remove this paper from queue
            db.delete(queued)
        # Commit to the database
        db.commit()
    except Exception as exc:
        # Do not let a failure on a single item stop the polling loop
        print("Citation processing failed: %s" % (exc,))
        db.rollback()
    finally:
        try:
            # Release the connection held by this session
            db.close()
        finally:
            # Call this function again after a while, whatever happened above
            threading.Timer(
                config.queue_polling_interval,
                fetch_citations_in_queue,
                args=(create_session,)
            ).start()
|
||||
|
||||
|
||||
def update_relationships(id, name, db):
|
||||
"""
|
||||
Update the relationships associated to a given paper.
|
||||
|
21
tools.py
21
tools.py
@ -18,6 +18,27 @@ def pretty_json(data):
|
||||
separators=(',', ': '))
|
||||
|
||||
|
||||
def get_identifier_from_url(url):
    """
    Get the identifier out of a DOI or arXiv URL.

    Recognized URLs are DOI resolver URLs (``dx.doi.org/<doi>`` or
    ``doi.org/<doi>``) and arXiv abstract URLs (``arxiv.org/abs/<id>``).
    Everything after the matched prefix is returned as the identifier.

    :param url: An input URL.
    :returns: A tuple ``(type, identifier)``, where ``type`` is either
            ``"doi"`` or ``"arxiv_id"``. Returns ``(None, None)`` if the URL
            could not be matched or carries no identifier.
    """
    if not url:
        return (None, None)

    # Checked in order. "doi.org/" also matches the legacy "dx.doi.org/" host.
    known_prefixes = (
        ("doi.org/", "doi"),
        ("arxiv.org/abs/", "arxiv_id"),
    )
    for prefix, id_type in known_prefixes:
        _, matched, identifier = url.partition(prefix)
        # Require the full prefix (trailing slash included) and a non-empty
        # identifier, so that a bare "arxiv.org/abs" is not sliced blindly.
        if matched and identifier:
            return (id_type, identifier)

    return (None, None)
|
||||
|
||||
|
||||
class APIResponse(bottle.HTTPResponse):
|
||||
"""
|
||||
Extend bottle.HTTPResponse base class to add Content-Type header.
|
||||
|
Loading…
Reference in New Issue
Block a user