2015-12-27 19:35:55 +01:00
"""
This file contains various utility functions .
"""
2015-12-27 23:46:43 +01:00
import re
2016-01-20 21:42:17 +01:00
import unicodedata
2015-12-27 23:46:43 +01:00
from itertools import islice , chain
# Compiled regular expression used to find URLs in free text.
# Pattern taken from https://gist.github.com/gruber/8891611
# It is case-insensitive (``(?i)``) and is made of two alternatives:
#   * URLs starting either with ``http:``/``https:`` or with a domain ending
#     in one of the listed TLDs followed by ``/``;
#   * bare domains ending in one of the listed TLDs, which are neither
#     preceded nor followed by ``@``.
# NOTE(review): the TLD alternation contains ``Ja`` between ``sj`` and ``sk``;
# this looks like a typo inherited from the gist -- confirm before changing.
URL_REGEX = re . compile ( r " (?i) \ b((?:https?:(?:/ { 1,3}|[a-z0-9 % ])|[a-z0-9. \ -]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^ \ s()<> {} \ [ \ ]]+| \ ([^ \ s()]*? \ ([^ \ s()]+ \ )[^ \ s()]*? \ )| \ ([^ \ s]+? \ ))+(?: \ ([^ \ s()]*? \ ([^ \ s()]+ \ )[^ \ s()]*? \ )| \ ([^ \ s]+? \ )|[^ \ s`!() \ [ \ ] {} ;: ' \" .,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[. 
\ -][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw) \ b/?(?!@))) " )
2015-12-27 19:35:55 +01:00
def replaceAll(text, replace_dict):
    """
    Apply several plain-string substitutions to a text.

    .. note::

        Substitutions are applied one after another, following the iteration
        order of ``replace_dict``. No particular order is guaranteed, so the
        output of one substitution may be altered by a later one.

    :param text: Text to perform the substitutions in.
    :param replace_dict: Dictionary mapping each string to look for to the
        string that takes its place.
    :returns: The text once every substitution has been applied.

    >>> replaceAll("foo bar foo thing", {"foo": "oof", "bar": "rab"})
    'oof rab oof thing'
    """
    result = text
    for needle, substitute in replace_dict.items():
        result = result.replace(needle, substitute)
    return result
def clean_whitespaces(text):
    """
    Collapse every run of whitespace in a text into a single space, dropping
    leading and trailing whitespace along the way.

    :param text: Text to normalise the whitespace of.
    :returns: The cleaned text.

    >>> clean_whitespaces("  this  is a  text with    spaces ")
    'this is a text with spaces'
    """
    # ``split()`` without argument splits on any run of whitespace.
    words = text.strip().split()
    separator = ' '
    return separator.join(words)
def remove_duplicates(some_list):
    """
    Remove the duplicates from a list, keeping the first occurrence of each
    item and preserving the original order.

    :param some_list: List (or any iterable of hashable items) to remove
        duplicates from.
    :returns: A new list without duplicates, in order of first appearance.
    :raises TypeError: If an item is not hashable.

    >>> remove_duplicates([1, 2, 3, 1])
    [1, 2, 3]
    >>> remove_duplicates([1, 2, 1, 2])
    [1, 2]
    >>> remove_duplicates(["b", "a", "b"])
    ['b', 'a']
    """
    # A plain ``list(set(...))`` returns the items in arbitrary order; track
    # the items already seen instead so that the output order is stable.
    seen = set()
    unique = []
    for item in some_list:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
2015-12-27 23:46:43 +01:00
def batch(iterable, size):
    """
    Get items from an iterable a batch at a time.

    .. note::

        Adapted from
        https://code.activestate.com/recipes/303279-getting-items-in-batches/.

    .. note::

        Every batch is a lazy iterator reading from the same underlying
        iterator, so each batch must be exhausted before the next one is
        requested.

    :param iterable: An iterable to get batches from.
    :param size: Maximum number of items in each batch.
    :returns: A generator yielding one iterator per batch. The last batch
        may hold fewer than ``size`` items; nothing is yielded for an empty
        iterable or when ``size`` is ``0``.

    >>> [list(b) for b in batch(range(7), 3)]
    [[0, 1, 2], [3, 4, 5], [6]]
    """
    # TODO: Unittest
    it = iter(iterable)
    while True:
        bi = islice(it, size)
        # Use the ``next()`` builtin (works on Python 2.6+ and 3) rather than
        # the Python 2 only ``.next()`` method, and end the generator
        # explicitly: letting StopIteration escape a generator body is a
        # RuntimeError since PEP 479.
        try:
            first = next(bi)
        except StopIteration:
            return
        # Put the probed item back in front of the rest of the batch.
        yield chain([first], bi)
def remove_URLs(text):
    """
    Strip the URLs out of a text. Only ``http`` URLs, ``https`` URLs and
    naked domains are recognised.

    :param text: The text to remove URLs from.
    :returns: The text without URLs, with its whitespace cleaned up.

    >>> remove_URLs("foobar http://example.com https://example.com foobar")
    'foobar foobar'
    """
    without_urls = URL_REGEX.sub("", text)
    # Removing URLs leaves stray spaces behind; normalise them.
    return clean_whitespaces(without_urls)
2016-01-10 14:44:27 +01:00
# Anything that is neither a word character, whitespace nor a hyphen.
_slugify_strip_re = re.compile(r'[^\w\s-]')
# Runs of whitespace, turned into a single separator by ``slugify``.
_slugify_hyphenate_re = re.compile(r'[\s]+')


def slugify(value):
    """
    Turn a value into an ASCII-only string suitable for a nice filename.

    The value is converted to text if needed, decomposed with Unicode NFKD
    normalization and reduced to its ASCII characters. Characters that are
    neither word characters, whitespace nor hyphens are then removed,
    surrounding whitespace is stripped, and each run of whitespace is
    replaced by an underscore. Case is preserved.

    From Django's "django/template/defaultfilters.py".

    :param value: The value to slugify.
    :returns: The slugified text.

    >>> slugify("El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro. ortez ce vieux whisky au juge blond qui fume sur son île intérieure, à Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο いろはにほへとちりぬるを Pchnąć w tę łódź jeża lub ośm skrzyń fig กว่าบรรดาฝูงสัตว์เดรัจฉาน")
    'El_pinguino_Wenceslao_hizo_kilometros_bajo_exhaustiva_lluvia_y_frio_anoraba_a_su_querido_cachorro_ortez_ce_vieux_whisky_au_juge_blond_qui_fume_sur_son_ile_interieure_a_Pchnac_w_te_odz_jeza_lub_osm_skrzyn_fig'
    """
    # Pick the text type of the running interpreter.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    text = value if isinstance(value, text_type) else text_type(value)

    # Decompose accented characters, then drop everything that is not ASCII.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    stripped = text_type(_slugify_strip_re.sub('', ascii_only).strip())
    return _slugify_hyphenate_re.sub('_', stripped)