Add a queue to store papers waiting for citation processing
* Also add a delete endpoint to delete a tag. * Various fixes in the code. * Better doc.
This commit is contained in:
parent
4a67e974b8
commit
ffe454a558
10
README.md
10
README.md
@ -379,6 +379,16 @@ Accept: application/vnd.api+json
|
|||||||
Response is empty HTTP 204.
|
Response is empty HTTP 204.
|
||||||
|
|
||||||
|
|
||||||
|
### Delete a tag
|
||||||
|
|
||||||
|
```
|
||||||
|
DELETE /tags/1
|
||||||
|
Accept: application/vnd.api+json
|
||||||
|
```
|
||||||
|
|
||||||
|
Response is empty HTTP 204.
|
||||||
|
|
||||||
|
|
||||||
### Delete a relationship between two papers
|
### Delete a relationship between two papers
|
||||||
|
|
||||||
```
|
```
|
||||||
|
23
database.py
23
database.py
@ -35,15 +35,12 @@ class RelationshipAssociation(Base):
|
|||||||
ondelete="CASCADE"))
|
ondelete="CASCADE"))
|
||||||
right_paper = sqlalchemy_relationship("Paper",
|
right_paper = sqlalchemy_relationship("Paper",
|
||||||
foreign_keys=right_id,
|
foreign_keys=right_id,
|
||||||
back_populates="related_by",
|
back_populates="related_by")
|
||||||
passive_deletes=True)
|
relationship = sqlalchemy_relationship("Relationship")
|
||||||
relationship = sqlalchemy_relationship("Relationship",
|
|
||||||
passive_deletes=True)
|
|
||||||
|
|
||||||
left_paper = sqlalchemy_relationship("Paper",
|
left_paper = sqlalchemy_relationship("Paper",
|
||||||
foreign_keys=left_id,
|
foreign_keys=left_id,
|
||||||
back_populates="related_to",
|
back_populates="related_to")
|
||||||
passive_deletes=True)
|
|
||||||
|
|
||||||
tag_association_table = Table(
|
tag_association_table = Table(
|
||||||
'tag_association', Base.metadata,
|
'tag_association', Base.metadata,
|
||||||
@ -56,7 +53,7 @@ class Paper(Base):
|
|||||||
__tablename__ = "papers"
|
__tablename__ = "papers"
|
||||||
id = Column(Integer, primary_key=True)
|
id = Column(Integer, primary_key=True)
|
||||||
doi = Column(String(), nullable=True, unique=True)
|
doi = Column(String(), nullable=True, unique=True)
|
||||||
arxiv_id = Column(String(25), nullable=True, unique=True)
|
arxiv_id = Column(String(30), nullable=True, unique=True)
|
||||||
# related_to are papers related to this paper (this_paper R …)
|
# related_to are papers related to this paper (this_paper R …)
|
||||||
related_to = sqlalchemy_relationship("RelationshipAssociation",
|
related_to = sqlalchemy_relationship("RelationshipAssociation",
|
||||||
foreign_keys="RelationshipAssociation.left_id",
|
foreign_keys="RelationshipAssociation.left_id",
|
||||||
@ -70,7 +67,8 @@ class Paper(Base):
|
|||||||
# Tags relationship
|
# Tags relationship
|
||||||
tags = sqlalchemy_relationship("Tag",
|
tags = sqlalchemy_relationship("Tag",
|
||||||
secondary=tag_association_table,
|
secondary=tag_association_table,
|
||||||
backref="papers")
|
backref="papers",
|
||||||
|
passive_deletes=True)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
|
return "<Paper(id='%d', doi='%s', arxiv_id='%s')>" % (
|
||||||
@ -142,3 +140,12 @@ class Tag(Base):
|
|||||||
"self": "/tags/%d" % (self.id,)
|
"self": "/tags/%d" % (self.id,)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CitationProcessingQueue(Base):
    """
    Queue of papers waiting for citation processing.

    Each row points at one ``Paper`` through ``paper_id``.
    """
    __tablename__ = "citationprocessingqueue"
    # Surrogate primary key of the queue entry
    id = Column(Integer, primary_key=True)
    # Queued paper. Unique, so a given paper can only be queued once. The
    # foreign key is declared with ON DELETE CASCADE.
    paper_id = Column(Integer,
                      ForeignKey('papers.id', ondelete="CASCADE"),
                      unique=True)
    # ORM accessor for the queued ``Paper`` object
    paper = sqlalchemy_relationship("Paper")
|
||||||
|
12
main.py
12
main.py
@ -2,6 +2,7 @@
|
|||||||
import bottle
|
import bottle
|
||||||
from bottle.ext import sqlalchemy
|
from bottle.ext import sqlalchemy
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
import config
|
import config
|
||||||
import database
|
import database
|
||||||
@ -10,6 +11,8 @@ import tools
|
|||||||
|
|
||||||
# Initialize db and include the SQLAlchemy plugin in bottle
|
# Initialize db and include the SQLAlchemy plugin in bottle
|
||||||
engine = create_engine('sqlite:///%s' % (config.database,), echo=True)
|
engine = create_engine('sqlite:///%s' % (config.database,), echo=True)
|
||||||
|
create_session = sessionmaker(bind=engine)
|
||||||
|
database.Base.metadata.create_all(engine)
|
||||||
|
|
||||||
app = bottle.Bottle()
|
app = bottle.Bottle()
|
||||||
plugin = sqlalchemy.Plugin(
|
plugin = sqlalchemy.Plugin(
|
||||||
@ -21,13 +24,15 @@ plugin = sqlalchemy.Plugin(
|
|||||||
keyword='db',
|
keyword='db',
|
||||||
# If it is true, execute `metadata.create_all(engine)` when plugin is
|
# If it is true, execute `metadata.create_all(engine)` when plugin is
|
||||||
# applied (default False).
|
# applied (default False).
|
||||||
create=True,
|
create=False,
|
||||||
# If it is true, plugin commit changes after route is executed (default
|
# If it is true, plugin commit changes after route is executed (default
|
||||||
# True).
|
# True).
|
||||||
commit=True,
|
commit=True,
|
||||||
# If it is true and keyword is not defined, plugin uses **kwargs argument
|
# If it is true and keyword is not defined, plugin uses **kwargs argument
|
||||||
# to inject session database (default False).
|
# to inject session database (default False).
|
||||||
use_kwargs=False
|
use_kwargs=False,
|
||||||
|
# Create session method
|
||||||
|
create_session=create_session
|
||||||
)
|
)
|
||||||
|
|
||||||
app.install(plugin)
|
app.install(plugin)
|
||||||
@ -53,6 +58,8 @@ app.route("/papers/<id:int>/relationships/<name>", method="DELETE",
|
|||||||
|
|
||||||
app.get("/tags", callback=routes.get.fetch_tags)
|
app.get("/tags", callback=routes.get.fetch_tags)
|
||||||
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
|
app.get("/tags/<id:int>", callback=routes.get.fetch_tags_by_id)
|
||||||
|
app.route("/tags/<id:int>", method="DELETE",
|
||||||
|
callback=routes.delete.delete_tag)
|
||||||
|
|
||||||
|
|
||||||
app.post("/papers", callback=routes.post.create_paper)
|
app.post("/papers", callback=routes.post.create_paper)
|
||||||
@ -66,4 +73,5 @@ app.route("/papers/<id:int>/relationships/<name>", method="PATCH",
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
routes.post.fetch_citations_in_queue(create_session)
|
||||||
app.run(host=config.host, port=config.port, debug=(not config.production))
|
app.run(host=config.host, port=config.port, debug=(not config.production))
|
||||||
|
@ -97,3 +97,28 @@ def delete_relationship(id, name, db):
|
|||||||
db.delete(relationship)
|
db.delete(relationship)
|
||||||
# Return an empty 204 on success
|
# Return an empty 204 on success
|
||||||
return tools.APIResponse(status=204, body="")
|
return tools.APIResponse(status=204, body="")
|
||||||
|
|
||||||
|
|
||||||
|
def delete_tag(id, db):
    """
    Remove the tag identified by ``id``.

    .. code-block:: bash

        DELETE /tags/1
        Accept: application/vnd.api+json


    .. code-block:: bash

        HTTP 204

    :param id: The id of the tag to delete.
    :param db: A database session, injected by the ``Bottle`` plugin.
    :returns: An empty 204 ``HTTPResponse`` on success, a 404 ``HTTPError`` \
            if no tag has this id.
    """
    tag = db.query(database.Tag).filter_by(id=id).first()
    # Guard clause: nothing to delete
    if not tag:
        return bottle.HTTPError(404, "Not found")
    db.delete(tag)
    # Return an empty 204 on success
    return tools.APIResponse(status=204, body="")
|
||||||
|
@ -3,8 +3,10 @@ This file contains POST routes methods.
|
|||||||
"""
|
"""
|
||||||
import bottle
|
import bottle
|
||||||
import json
|
import json
|
||||||
|
import threading
|
||||||
from sqlalchemy.exc import IntegrityError
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
|
||||||
|
import config
|
||||||
import database
|
import database
|
||||||
import tools
|
import tools
|
||||||
from reference_fetcher import arxiv
|
from reference_fetcher import arxiv
|
||||||
@ -115,19 +117,19 @@ def create_by_doi(doi, db):
|
|||||||
return paper
|
return paper
|
||||||
|
|
||||||
|
|
||||||
def create_by_arxiv(arxiv, db):
|
def create_by_arxiv(arxiv_id, db):
|
||||||
"""
|
"""
|
||||||
Create a new resource identified by its arXiv eprint ID, if it does not
|
Create a new resource identified by its arXiv eprint ID, if it does not
|
||||||
exist.
|
exist.
|
||||||
|
|
||||||
:param arxiv: The arXiv eprint ID.
|
:param arxiv_id: The arXiv eprint ID.
|
||||||
:param db: A database session.
|
:param db: A database session.
|
||||||
:returns: ``None`` if insertion failed, the ``Paper`` object otherwise.
|
:returns: ``None`` if insertion failed, the ``Paper`` object otherwise.
|
||||||
"""
|
"""
|
||||||
paper = database.Paper(arxiv_id=arxiv)
|
paper = database.Paper(arxiv_id=arxiv_id)
|
||||||
|
|
||||||
# Try to fetch an arXiv id
|
# Try to fetch an arXiv id
|
||||||
doi = arxiv.get_doi(arxiv)
|
doi = arxiv.get_doi(arxiv_id)
|
||||||
if doi:
|
if doi:
|
||||||
paper.doi = doi
|
paper.doi = doi
|
||||||
|
|
||||||
@ -156,20 +158,35 @@ def add_cite_relationship(paper, db):
|
|||||||
# If paper is on arXiv
|
# If paper is on arXiv
|
||||||
if paper.arxiv_id is not None:
|
if paper.arxiv_id is not None:
|
||||||
# Get the cited DOIs
|
# Get the cited DOIs
|
||||||
cited_dois = arxiv.get_cited_dois(paper.arxiv_id)
|
cited_urls = arxiv.get_cited_dois(paper.arxiv_id)
|
||||||
# Filter out the ones that were not matched
|
# Filter out the ones that were not matched
|
||||||
cited_dois = [cited_dois[k]
|
cited_urls = [cited_urls[k]
|
||||||
for k in cited_dois if cited_dois[k] is not None]
|
for k in cited_urls if cited_urls[k] is not None]
|
||||||
for doi in cited_dois:
|
for url in cited_urls:
|
||||||
|
type, identifier = tools.get_identifier_from_url(url)
|
||||||
|
if type is None:
|
||||||
|
# No identifier found
|
||||||
|
continue
|
||||||
# Get the associated paper in the db
|
# Get the associated paper in the db
|
||||||
right_paper = (db.query(database.Paper).
|
right_paper = (db.query(database.Paper)
|
||||||
filter_by(doi=doi).first())
|
.filter(getattr(database.Paper, type) == identifier)
|
||||||
|
.first())
|
||||||
if right_paper is None:
|
if right_paper is None:
|
||||||
# If paper does not exist in db, add it
|
# If paper is not in db, add it
|
||||||
right_paper = create_by_doi(doi, db)
|
if type == "doi":
|
||||||
# Update cite relationship for this paper, recursively
|
right_paper = create_by_doi(identifier, db)
|
||||||
# TODO: Known bug: too many levels of recursion!
|
elif type == "arxiv_id":
|
||||||
# add_cite_relationship(right_paper, db)
|
right_paper = create_by_arxiv(identifier, db)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
# Push this paper on the queue for update of cite relationships
|
||||||
|
queue = database.CitationProcessingQueue()
|
||||||
|
queue.paper = right_paper
|
||||||
|
try:
|
||||||
|
db.add(queue)
|
||||||
|
except IntegrityError:
|
||||||
|
# Unique constraint violation, relationship already exists
|
||||||
|
db.rollback()
|
||||||
# Update the relationships
|
# Update the relationships
|
||||||
update_relationship_backend(paper.id, right_paper.id, "cite", db)
|
update_relationship_backend(paper.id, right_paper.id, "cite", db)
|
||||||
# If paper is not on arXiv, nothing to do
|
# If paper is not on arXiv, nothing to do
|
||||||
@ -177,6 +194,39 @@ def add_cite_relationship(paper, db):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_citations_in_queue(create_session):
    """
    Process the first item in the queue, waiting for citation processing.

    .. note::

        Re-arms itself through a ``threading.Timer`` after the delay defined
        by ``config.queue_polling_interval``, so that queued papers are
        processed one at a time, in the background.

    :param create_session: a ``SQLAlchemy`` ``sessionmaker``.
    :returns: Nothing.
    """
    # Get a db Session, dedicated to this run
    db = create_session()
    try:
        queued = db.query(database.CitationProcessingQueue).first()
        if queued:
            print("Processing citation relationships for %s." %
                  (queued.paper,))
            # Process this paper
            add_cite_relationship(queued.paper, db)
            # Remove this paper from queue
            db.delete(queued)
        # Commit to the database
        db.commit()
    except Exception as exc:
        # A failure on one item must not stop the polling loop: undo the
        # partial work and try again at next run.
        # NOTE(review): an item that always fails stays first in the queue
        # and is retried forever.
        db.rollback()
        print("Citation processing failed: %s" % (exc,))
    finally:
        # Release the session, a new one is created at each run
        db.close()
        # Call this function again after a while. The timer is a daemon
        # thread so that it does not keep the process alive once the main
        # thread exits.
        timer = threading.Timer(
            config.queue_polling_interval,
            fetch_citations_in_queue,
            args=(create_session,)
        )
        timer.daemon = True
        timer.start()
|
||||||
|
|
||||||
|
|
||||||
def update_relationships(id, name, db):
|
def update_relationships(id, name, db):
|
||||||
"""
|
"""
|
||||||
Update the relationships associated to a given paper.
|
Update the relationships associated to a given paper.
|
||||||
|
21
tools.py
21
tools.py
@ -18,6 +18,27 @@ def pretty_json(data):
|
|||||||
separators=(',', ': '))
|
separators=(',', ': '))
|
||||||
|
|
||||||
|
|
||||||
|
def get_identifier_from_url(url):
    """
    Get the identifier out of a DOI or arXiv URL.

    The identifier is everything following ``dx.doi.org/`` (DOI) or
    ``arxiv.org/abs/`` (arXiv) in the URL. DOI is tried first.

    :param url: An input URL.
    :returns: A tuple ``(type, identifier)``, ``type`` being ``"doi"`` or \
            ``"arxiv_id"``. Returns ``(None, None)`` if could not match, \
            or if the identifier part is empty.
    """
    # Markers include the trailing slash, so that the identifier is exactly
    # what follows the marker (no hard-coded offset).
    markers = (
        ("dx.doi.org/", "doi"),
        ("arxiv.org/abs/", "arxiv_id"),
    )
    for marker, kind in markers:
        _, found, identifier = url.partition(marker)
        # ``found`` is empty when the marker is absent from the URL
        if found and identifier:
            return (kind, identifier)

    return (None, None)
|
||||||
|
|
||||||
|
|
||||||
class APIResponse(bottle.HTTPResponse):
|
class APIResponse(bottle.HTTPResponse):
|
||||||
"""
|
"""
|
||||||
Extend bottle.HTTPResponse base class to add Content-Type header.
|
Extend bottle.HTTPResponse base class to add Content-Type header.
|
||||||
|
Loading…
Reference in New Issue
Block a user