diff --git a/README.md b/README.md index 8dcb003..776a4c1 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,9 @@ This list is especially useful for the `libbmc.papers.identifiers` module, which is using it to loop through all the available identifier types, to fetch for them in the paper and retrieve BibTeX from it. +You can also write a specific citation extraction module for this repository +in `libbmc/citations/repositories/{REPOSITORY}.py`. + ## License diff --git a/libbmc/citations/bbl.py b/libbmc/citations/bbl.py index 71be937..427c195 100644 --- a/libbmc/citations/bbl.py +++ b/libbmc/citations/bbl.py @@ -1,5 +1,5 @@ """ -This files contains all the functions to extract DOIs of citations from .bbl +This file contains all the functions to extract DOIs of citations from .bbl files. """ import os diff --git a/libbmc/citations/bibtex.py b/libbmc/citations/bibtex.py index 7d03bc3..b8574eb 100644 --- a/libbmc/citations/bibtex.py +++ b/libbmc/citations/bibtex.py @@ -1,5 +1,5 @@ """ -This files contains all the functions to extract DOIs of citations from +This file contains all the functions to extract DOIs of citations from BibTeX files. """ import os diff --git a/libbmc/citations/pdf.py b/libbmc/citations/pdf.py index d6f8eb4..aa7aaf4 100644 --- a/libbmc/citations/pdf.py +++ b/libbmc/citations/pdf.py @@ -1,5 +1,5 @@ """ -This files contains all the functions to extract DOIs of citations from +This file contains all the functions to extract DOIs of citations from PDF files. """ import os diff --git a/libbmc/citations/plaintext.py b/libbmc/citations/plaintext.py index 16647a7..21b9949 100644 --- a/libbmc/citations/plaintext.py +++ b/libbmc/citations/plaintext.py @@ -1,5 +1,5 @@ """ -This files contains all the functions to extract DOIs of citations from +This file contains all the functions to extract DOIs of citations from plaintext files. 
""" import os diff --git a/libbmc/citations/repositories/__init__.py b/libbmc/citations/repositories/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libbmc/citations/repositories/arxiv.py b/libbmc/citations/repositories/arxiv.py new file mode 100644 index 0000000..1a86df6 --- /dev/null +++ b/libbmc/citations/repositories/arxiv.py @@ -0,0 +1,50 @@ +""" +This file contains all the functions to extract DOIs of citations from arXiv +papers. +""" +from libbmc.citations import bbl +from libbmc.repositories import arxiv + + +def get_plaintext_citations(arxiv_id): + """ + Get the citations of a given preprint, in plain text. + + .. note:: + + Bulk download of sources from arXiv is not permitted by their API. \ + You should have a look at http://arxiv.org/help/bulk_data_s3. + + :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \ + a canonical form. + :returns: A list of cleaned plaintext citations. + """ + plaintext_citations = [] + # Get the list of bbl files for this preprint + bbl_files = arxiv.get_bbl(arxiv_id) + for bbl_file in bbl_files: + # Fetch the plaintext citations for each of the bbl files + plaintext_citations.extend(bbl.get_plaintext_citations(bbl_file)) + return plaintext_citations + + +def get_cited_dois(arxiv_id): + """ + Get the DOIs of the papers cited in a given preprint. + + .. note:: + + Bulk download of sources from arXiv is not permitted by their API. \ + You should have a look at http://arxiv.org/help/bulk_data_s3. + + :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \ + a canonical form. + :returns: A dict of cleaned plaintext citations and their associated DOI. 
+ """ + dois = {} + # Get the list of bbl files for this preprint + bbl_files = arxiv.get_bbl(arxiv_id) + for bbl_file in bbl_files: + # Fetch the cited DOIs for each of the bbl files + dois.update(bbl.get_cited_dois(bbl_file)) + return dois diff --git a/libbmc/repositories/arxiv.py b/libbmc/repositories/arxiv.py index 38b5eee..2f55baa 100644 --- a/libbmc/repositories/arxiv.py +++ b/libbmc/repositories/arxiv.py @@ -18,7 +18,6 @@ from requests.exceptions import RequestException from libbmc import __valid_identifiers__ from libbmc import tools -from libbmc.citations import bbl # Append arXiv to the valid identifiers list __valid_identifiers__ += ["repositories.arxiv"] @@ -475,25 +474,3 @@ def get_bbl(arxiv_id): bbl_files = [tar_file.extractfile(member).read().decode(tarfile.ENCODING) for member in bbl_files] return bbl_files - - -def get_citations(arxiv_id): - """ - Get the DOIs cited by a given preprint. - - .. note:: - - Bulk download of sources from arXiv is not permitted by their API. \ - You should have a look at http://arxiv.org/help/bulk_data_s3. - - :param arxiv_id: The arXiv id (e.g. ``1401.2910`` or ``1401.2910v1``) in \ - a canonical form. - :returns: A dict of cleaned plaintext citations and their associated DOI. - """ - dois = {} - # Get the list of bbl files for this preprint - bbl_files = get_bbl(arxiv_id) - for bbl_file in bbl_files: - # Fetch the cited DOIs for each of the bbl files - dois.update(bbl.get_cited_dois(bbl_file)) - return dois