A Python library to deal with scientific papers.

bbl.py 2.8KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. """
  2. This file contains all the functions to extract DOIs of citations from .bbl
  3. files.
  4. """
  5. import os
  6. import re
  7. import subprocess
  8. from libbmc import tools
  9. from libbmc.citations import plaintext
  10. # Regex to match bibitems
  11. BIBITEMS_REGEX = re.compile(r"\\bibitem\{.+?\}")
  12. # Regex to match end of bibliography
  13. ENDTHEBIBLIOGRAPHY_REGEX = re.compile(r"\\end\{thebibliography}.*")
  14. def bibitem_as_plaintext(bibitem):
  15. """
  16. Return a plaintext representation of a bibitem from the ``.bbl`` file.
  17. .. note::
  18. This plaintext representation can be super ugly, contain URLs and so \
  19. on.
  20. .. note::
  21. You need to have ``delatex`` installed system-wide, or to build it in \
  22. this repo, according to the ``README.md`` before using this \
  23. function.
  24. :param bibitem: The text content of the bibitem.
  25. :returns: A cleaned plaintext citation from the bibitem.
  26. """
  27. try:
  28. output = subprocess.check_output(["delatex",
  29. "-s"],
  30. input=bibitem.encode("utf-8"))
  31. except FileNotFoundError:
  32. script_dir = os.path.dirname(os.path.abspath(__file__))
  33. output = subprocess.check_output(["%s/../external/opendetex/delatex" %
  34. (script_dir,),
  35. "-s"],
  36. input=bibitem.encode("utf-8"))
  37. output = output.decode("utf-8")
  38. output = tools.clean_whitespaces(output)
  39. return output
  40. def get_plaintext_citations(bbl):
  41. """
  42. Parse a ``*.bbl`` file to get a clean list of plaintext citations.
  43. :param bbl: Either the path to the .bbl file or the content of a ``.bbl`` \
  44. file.
  45. :returns: A list of cleaned plaintext citations.
  46. """
  47. # Handle path or content
  48. if os.path.isfile(bbl):
  49. with open(bbl, 'r') as fh:
  50. bbl_content = fh.read()
  51. else:
  52. bbl_content = bbl
  53. # Get a list of bibitems, taking the first item out as it is *before* the
  54. # first \bibitem
  55. bibitems = BIBITEMS_REGEX.split(bbl_content)[1:]
  56. # Delete the text after the \end{thebibliography}
  57. bibitems = [ENDTHEBIBLIOGRAPHY_REGEX.sub("", i).strip() for i in bibitems]
  58. # Clean every bibitem to have plaintext
  59. cleaned_bbl = [bibitem_as_plaintext(bibitem) for bibitem in bibitems]
  60. return cleaned_bbl
  61. def get_cited_DOIs(bbl):
  62. """
  63. Get the DOIs of the papers cited in a .bbl file.
  64. :param bbl: Either the path to a .bbl file or the content \
  65. of a .bbl file.
  66. :returns: A dict of cleaned plaintext citations and their associated DOI.
  67. """
  68. # Get the plaintext citations from the bbl file
  69. plaintext_citations = get_plaintext_citations(bbl)
  70. # Use the plaintext citations parser on these citations
  71. return plaintext.get_cited_DOIs(plaintext_citations)