Functions to handle arXiv metadata

This commit is contained in:
Phyks 2014-05-02 00:07:49 +02:00
parent 980e678883
commit 289c7dece4
3 changed files with 68 additions and 41 deletions

View File

@ -107,7 +107,7 @@ Here are some sources of inspirations for this project :
A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your own. A list of ideas and TODO. Don't hesitate to give feedback on the ones you really want or to propose your own.
20. No DOI for arXiv / HAL 20. No DOI for HAL
30. Parameter to disable remote search 30. Parameter to disable remote search
40. Open file 40. Open file
45. Doc / Man 45. Doc / Man
@ -119,3 +119,9 @@ A list of ideas and TODO. Don't hesitate to give feedback on the ones you really
## Issues ? ## Issues ?
* Remove the watermarks on PDF files => done; Okular shows some warnings on the generated PDF, but it seems OK. This appears to be a bug in Okular. * Remove the watermarks on PDF files => done; Okular shows some warnings on the generated PDF, but it seems OK. This appears to be a bug in Okular.
## Thanks
* Nathan Grigg for his [arxiv2bib](https://pypi.python.org/pypi/arxiv2bib/1.0.5#downloads) python module
* François Boulogne for his [python-bibtexparser](https://github.com/sciunto/python-bibtexparser) python module and his integration of new requested features

View File

@ -5,6 +5,7 @@ import isbntools
import re import re
import requesocks as requests # Requesocks is requests with SOCKS support import requesocks as requests # Requesocks is requests with SOCKS support
import subprocess import subprocess
import arxiv2bib as arxiv_metadata
import tools import tools
import params import params
@ -178,7 +179,6 @@ def doi2Bib(doi):
arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)') arXiv_re = re.compile(r'arXiv:\s*([\w\.\/\-]+)')
arXiv_wo_v_re = re.compile(r'v\d+\Z')
def findArXivId(src): def findArXivId(src):
@ -208,21 +208,19 @@ def findArXivId(src):
# Error happened # Error happened
tools.warning(err) tools.warning(err)
return False return False
else:
cleanID = False return extractID
if extractID:
cleanID = arXiv_wo_v_re.sub('', extractID.group(1))
return cleanID
def arXiv2Bib(arxiv): def arXiv2Bib(arxiv):
"""Returns bibTeX string of metadata for a given arXiv id """Returns bibTeX string of metadata for a given arXiv id
arxiv is an arxiv id arxiv is an arxiv id
From : https://github.com/minad/bibsync/blob/master/lib/bibsync/actions/synchronize_metadata.rb
""" """
arxiv = "oai:arXiv.org:"+arxiv bibtex = arxiv_metadata.arxiv2bib([arxiv])
bibtex = '' for bib in bibtex:
if isinstance(bib, arxiv_metadata.ReferenceErrorInfo):
return bibtex continue
else:
return bib.bibtex()
return False

79
main.py
View File

@ -18,7 +18,7 @@ EDITOR = os.environ.get('EDITOR') if os.environ.get('EDITOR') else 'vim'
def checkBibtex(filename, bibtex): def checkBibtex(filename, bibtex):
print("The bibtex entry found for "+filename+" is :") print("The bibtex entry found for "+filename+" is:")
bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding) bibtex = BibTexParser(bibtex, customization=homogeneize_latex_encoding)
bibtex = bibtex.get_entry_dict() bibtex = bibtex.get_entry_dict()
@ -29,7 +29,7 @@ def checkBibtex(filename, bibtex):
else: else:
bibtex_string = '' bibtex_string = ''
print(bibtex_string) print(bibtex_string)
check = tools.rawInput("Is it correct ? [Y/n] ") check = tools.rawInput("Is it correct? [Y/n] ")
while check.lower() == 'n': while check.lower() == 'n':
with tempfile.NamedTemporaryFile(suffix=".tmp") as tmpfile: with tempfile.NamedTemporaryFile(suffix=".tmp") as tmpfile:
@ -46,9 +46,9 @@ def checkBibtex(filename, bibtex):
bibtex_string = backend.parsed2Bibtex(bibtex) bibtex_string = backend.parsed2Bibtex(bibtex)
else: else:
bibtex_string = '' bibtex_string = ''
print("\nThe bibtex entry for "+filename+" is :") print("\nThe bibtex entry for "+filename+" is:")
print(bibtex_string) print(bibtex_string)
check = tools.rawInput("Is it correct ? [Y/n] ") check = tools.rawInput("Is it correct? [Y/n] ")
return bibtex return bibtex
@ -58,37 +58,52 @@ def addFile(src, filetype):
""" """
if filetype == 'article' or filetype is None: if filetype == 'article' or filetype is None:
doi = fetcher.findDOI(src) doi = fetcher.findDOI(src)
if (filetype == 'article' or filetype is None) and doi is False:
arxiv = fetcher.findArXivId(src)
if filetype == 'book' or (filetype is None and doi is False): if filetype == 'book' or (filetype is None and doi is False and arxiv is
False):
isbn = fetcher.findISBN(src) isbn = fetcher.findISBN(src)
if doi is False and isbn is False: if doi is False and isbn is False and arxiv is False:
if filetype is None: if filetype is None:
tools.warning("Could not determine the DOI or the ISBN for " + tools.warning("Could not determine the DOI nor the arXiv id nor " +
src+"."+"Switching to manual entry.") "the ISBN for "+src+"."+"Switching to manual entry.")
doi_isbn = '' doi_arxiv_isbn = ''
while doi_isbn not in ['doi', 'isbn']: while doi_arxiv_isbn not in ['doi', 'arxiv', 'isbn']:
doi_isbn = tools.rawInput("DOI / ISBN ? ").lower() doi_arxiv_isbn = tools.rawInput("DOI / arXiv / ISBN? ").lower()
if doi_isbn == 'doi': if doi_arxiv_isbn == 'doi':
doi = tools.rawInput('DOI ? ') doi = tools.rawInput('DOI? ')
elif doi_arxiv_isbn == 'arxiv':
arxiv = tools.rawInput('arXiv id? ')
else: else:
isbn = tools.rawInput('ISBN ? ') isbn = tools.rawInput('ISBN? ')
elif filetype == 'article': elif filetype == 'article':
tools.warning("Could not determine the DOI for "+src + tools.warning("Could not determine the DOI nor the arXiv id for " +
", switching to manual entry.") src+", switching to manual entry.")
doi = tools.rawInput('DOI ? ') doi_arxiv = ''
while doi_arxiv not in ['doi', 'arxiv']:
doi_arxiv = tools.rawInput("DOI / arXiv? ").lower()
if doi_arxiv == 'doi':
doi = tools.rawInput('DOI? ')
else:
arxiv = tools.rawInput('arXiv id? ')
elif filetype == 'book': elif filetype == 'book':
tools.warning("Could not determine the ISBN for "+src + tools.warning("Could not determine the ISBN for "+src +
", switching to manual entry.") ", switching to manual entry.")
isbn = tools.rawInput('ISBN ? ') isbn = tools.rawInput('ISBN? ')
elif doi is not False: elif doi is not False:
print("DOI for "+src+" is "+doi+".") print("DOI for "+src+" is "+doi+".")
elif arxiv is not False:
print("ArXiv id for "+src+" is "+arxiv+".")
elif isbn is not False: elif isbn is not False:
print("ISBN for "+src+" is "+isbn+".") print("ISBN for "+src+" is "+isbn+".")
if doi is not False and doi != '': if doi is not False and doi != '':
# Add extra \n for bibtexparser # Add extra \n for bibtexparser
bibtex = fetcher.doi2Bib(doi).strip().replace(',', ",\n")+"\n" bibtex = fetcher.doi2Bib(doi).strip().replace(',', ",\n")+"\n"
elif arxiv is not False and arxiv != '':
bibtex = fetcher.arXiv2Bib(arxiv).strip().replace(',', ",\n")+"\n"
elif isbn is not False and isbn != '': elif isbn is not False and isbn != '':
# Idem # Idem
bibtex = fetcher.isbn2Bib(isbn).strip()+"\n" bibtex = fetcher.isbn2Bib(isbn).strip()+"\n"
@ -103,7 +118,7 @@ def addFile(src, filetype):
tools.warning("file "+new_name+" already exists.") tools.warning("file "+new_name+" already exists.")
default_rename = new_name.replace(tools.getExtension(new_name), default_rename = new_name.replace(tools.getExtension(new_name),
" (2)"+tools.getExtension(new_name)) " (2)"+tools.getExtension(new_name))
rename = tools.rawInput("New name ["+default_rename+"] ? ") rename = tools.rawInput("New name ["+default_rename+"]? ")
if rename == '': if rename == '':
new_name = default_rename new_name = default_rename
else: else:
@ -150,7 +165,7 @@ def resync():
while not confirm: while not confirm:
filename = tools.rawInput("File to import for this entry " + filename = tools.rawInput("File to import for this entry " +
"(leave empty to delete the " + "(leave empty to delete the " +
"entry) ? ") "entry)? ")
if filename == '': if filename == '':
break break
else: else:
@ -163,6 +178,14 @@ def resync():
"DOI, continue anyway " + "DOI, continue anyway " +
"? [y/N]") "? [y/N]")
confirm = (confirm.lower() == 'y') confirm = (confirm.lower() == 'y')
if 'Eprint' in entry.keys():
arxiv = fetcher.findArXivId(filename)
if arxiv is not False and arxiv != entry['Eprint']:
confirm = tools.rawInput("Found arXiv id does " +
"not match bibtex " +
"entry arxiv id, " +
"continue anyway ? [y/N]")
confirm = (confirm.lower() == 'y')
elif 'isbn' in entry.keys(): elif 'isbn' in entry.keys():
isbn = fetcher.findISBN(filename) isbn = fetcher.findISBN(filename)
if isbn is not False and isbn != entry['isbn']: if isbn is not False and isbn != entry['isbn']:
@ -187,7 +210,7 @@ def resync():
print("Found file without any associated entry in index.") print("Found file without any associated entry in index.")
action = '' action = ''
while action.lower() not in ['import', 'delete']: while action.lower() not in ['import', 'delete']:
action = tools.rawInput("What to do ? [import / delete] ") action = tools.rawInput("What to do? [import / delete] ")
action = action.lower() action = action.lower()
if action == 'import': if action == 'import':
tmp = tempfile.NamedTemporaryFile() tmp = tempfile.NamedTemporaryFile()
@ -209,11 +232,11 @@ def resync():
if __name__ == '__main__': if __name__ == '__main__':
try: try:
if len(sys.argv) < 2: if len(sys.argv) < 2:
sys.exit("Usage : TODO") sys.exit("Usage: TODO")
if sys.argv[1] == 'download': if sys.argv[1] == 'download':
if len(sys.argv) < 3: if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + sys.exit("Usage: " + sys.argv[0] +
" download FILE [article|book]") " download FILE [article|book]")
filetype = None filetype = None
@ -227,7 +250,7 @@ if __name__ == '__main__':
if sys.argv[1] == 'import': if sys.argv[1] == 'import':
if len(sys.argv) < 3: if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + sys.exit("Usage: " + sys.argv[0] +
" import FILE [article|book]") " import FILE [article|book]")
filetype = None filetype = None
@ -241,10 +264,10 @@ if __name__ == '__main__':
elif sys.argv[1] == 'delete': elif sys.argv[1] == 'delete':
if len(sys.argv) < 3: if len(sys.argv) < 3:
sys.exit("Usage : " + sys.argv[0] + " delete FILE|ID") sys.exit("Usage: " + sys.argv[0] + " delete FILE|ID")
confirm = tools.rawInput("Are you sure you want to delete " + confirm = tools.rawInput("Are you sure you want to delete " +
sys.argv[2]+" ? [y/N] ") sys.argv[2]+"? [y/N] ")
if confirm.lower() == 'y': if confirm.lower() == 'y':
if not backend.deleteId(sys.argv[2]): if not backend.deleteId(sys.argv[2]):
@ -263,8 +286,8 @@ if __name__ == '__main__':
elif sys.argv[1] == 'resync': elif sys.argv[1] == 'resync':
if len(sys.argv) > 2 and sys.argv[2] == 'help': if len(sys.argv) > 2 and sys.argv[2] == 'help':
sys.exit("Usage : " + sys.argv[0] + " resync") sys.exit("Usage: " + sys.argv[0] + " resync")
confirm = tools.rawInput("Resync files and bibtex index ? [y/N] ") confirm = tools.rawInput("Resync files and bibtex index? [y/N] ")
if confirm.lower() == 'y': if confirm.lower() == 'y':
resync() resync()