103 lines
2.9 KiB
Python
103 lines
2.9 KiB
Python
|
"""
|
|||
|
Helper to clean YouTube titles by removing the usual junk.
|
|||
|
"""
|
|||
|
import re
|
|||
|
|
|||
|
|
|||
|
def find_separator(str):
    """
    Find a common separator used in YouTube titles to separate the artist
    from the track name.

    Separators are tried in a fixed priority order; the first one present
    anywhere in the string wins, even if a lower-priority separator occurs
    earlier in the text.

    :param str: The raw title to inspect. The name shadows the builtin but is
        kept unchanged so keyword callers keep working.
    :return: ``{"index": <start offset>, "length": <separator length>}`` for
        the first separator found, or ``None`` when the input is empty/None
        or contains no known separator.
    """
    # Truthiness guard covers both "" and None (len(None) would raise).
    if not str:
        return None

    # Listed in priority order.
    separators = (' -- ', ' - ', ' – ', ' — ', '///', '►')

    for sep in separators:
        index = str.find(sep)
        if index > -1:
            return {
                "index": index,
                "length": len(sep),
            }

    return None
|
|||
|
|
|||
|
|
|||
|
def split(yt_title):
    """
    Split a title into artist and track title around the separator found by
    ``find_separator``.

    :param yt_title: The raw title string (may be empty or ``None``).
    :return: A dict with keys ``"artist"`` and ``"title"``. Both values are
        ``None`` when the input is empty/None or no separator is found;
        otherwise each side of the separator is passed through ``clean``.
    """
    # Bail out before searching: nothing to split in an empty/None title.
    if not yt_title:
        return {"artist": None, "title": None}

    # Find separator
    separator = find_separator(yt_title)
    if separator is None:
        return {"artist": None, "title": None}

    # Split artist and title around the separator
    left_end = separator["index"]
    right_start = left_end + separator["length"]

    # Do some cleanup on both halves
    return {
        "artist": clean(yt_title[:left_end]),
        "title": clean(yt_title[right_start:]),
    }
|
|||
|
|
|||
|
|
|||
|
# Ordered clean-up rules used by clean(): (compiled pattern, replacement).
# Order matters: junk inside parentheses is removed first, the emptied "()"
# is dropped afterwards, and quoted titles are extracted last.
#
# The patterns are plain Python regexes: flags are passed via re.IGNORECASE
# rather than JavaScript-style "/.../i" delimiters, which Python would match
# as literal characters.
_CLEAN_RULES = (
    # **NEW**
    (re.compile(r"\s*\*+\s?\S+\s?\*+$"), ""),
    # [whatever]
    (re.compile(r"\s*\[[^\]]+\]$"), ""),
    # (whatever version)
    (re.compile(r"\s*\([^\)]*version\)$", re.IGNORECASE), ""),
    # video extensions
    (re.compile(r"\s*\.(avi|wmv|mpg|mpeg|flv)$", re.IGNORECASE), ""),
    # (LYRIC VIDEO)
    (re.compile(r"\s*(LYRIC VIDEO\s*)?(lyric video\s*)", re.IGNORECASE), ""),
    # (Official Track Stream)
    # NOTE(review): the pattern previously read "title" instead of "track",
    # which looks like a search-and-replace artefact; both are accepted.
    (re.compile(r"\s*(Official (?:track|title) Stream*)", re.IGNORECASE), ""),
    # (official)? (music)? video
    (re.compile(r"\s*(of+icial\s*)?(music\s*)?video", re.IGNORECASE), ""),
    # (official)? (music)? audio
    (re.compile(r"\s*(of+icial\s*)?(music\s*)?audio", re.IGNORECASE), ""),
    # (ALBUM TRACK) -- same "track"/"title" note as above
    (
        re.compile(
            r"\s*(ALBUM (?:track|title)\s*)?(album (?:track|title)\s*)",
            re.IGNORECASE,
        ),
        "",
    ),
    # (Cover Art)
    (re.compile(r"\s*(COVER ART\s*)?(Cover Art\s*)", re.IGNORECASE), ""),
    # (official)
    (re.compile(r"\s*\(\s*of+icial\s*\)", re.IGNORECASE), ""),
    # (1999)
    (re.compile(r"\s*\(\s*[0-9]{4}\s*\)", re.IGNORECASE), ""),
    # (HD) / (HQ)
    (re.compile(r"\s+\(\s*(HD|HQ)\s*\)$"), ""),
    # HD / HQ
    (re.compile(r"\s+(HD|HQ)\s*$"), ""),
    # video clip
    (re.compile(r"\s*video\s*clip", re.IGNORECASE), ""),
    # Full Album
    (re.compile(r"\s*full\s*album", re.IGNORECASE), ""),
    # live
    (re.compile(r"\s+\(?live\)?$", re.IGNORECASE), ""),
    # Leftovers after e.g. (official video)
    (re.compile(r"\(+\s*\)+"), ""),
    # Artist - The new "Track title" featuring someone
    # The replacement must be a raw string: '\2' would be the \x02 character,
    # not a reference to the second group.
    (re.compile(r'^(|.*\s)"(.*)"(\s.*|)$'), r"\2"),
    # 'Track title'
    (re.compile(r"^(|.*\s)'(.*)'(\s.*|)$"), r"\2"),
)


def clean(title):
    """
    Remove usual junk from a YouTube title.

    Strips surrounding whitespace, applies every rule in ``_CLEAN_RULES`` in
    order (all occurrences of each pattern are removed), then trims leftover
    whitespace and dashes from both ends.

    :param title: The raw artist or track string. Empty strings and ``None``
        are returned unchanged.
    :return: The cleaned string.
    """
    if not title:
        return title

    title = title.strip()
    for pattern, replacement in _CLEAN_RULES:
        title = pattern.sub(replacement, title)

    # trim white chars and dash; str methods return a new string, so the
    # result has to be returned rather than discarded.
    return title.strip(" \t\n\r-")
|