youtube2local/helpers/title.py

103 lines
2.9 KiB
Python
Raw Normal View History

2016-06-09 18:31:39 +02:00
"""
Helper to clean Youtube titles, removing usual junk.
"""
import re
def find_separator(str):
"""
Find a common separators used in Youtube titles to separate artist and
track name.
"""
separators = [' -- ', ' - ', ' ', '', '///', '']
if len(str) == 0:
return None
for sep in separators:
index = str.find(sep)
if index > -1:
return {
"index": index,
"length": len(sep)
}
return None
def split(yt_title):
"""
Split a title according to found separator.
"""
# Find separator
separator = find_separator(yt_title)
if separator is None or len(yt_title) == 0:
return {
"artist": None,
"title": None
}
# Split artist and title
artist = yt_title[0:separator["index"]]
title = yt_title[separator["index"] + separator["length"]:]
# Do some cleanup
artist = clean(artist)
title = clean(title)
return {
"artist": artist,
"title": title
}
def clean(title):
"""
Remove usual junk from a Youtube title.
"""
title = re.sub(r"/^\s+|\s+$/g", '', title)
# **NEW**
title = re.sub(r"/\s*\*+\s?\S+\s?\*+$/", '', title)
# [whatever]
title = re.sub(r"/\s*\[[^\]]+\]$/", '', title)
# (whatever version)
title = re.sub(r"/\s*\([^\)]*version\)$/i", '', title)
# video extensions
title = re.sub(r"/\s*\.(avi|wmv|mpg|mpeg|flv)$/i", '', title)
# (LYRIC VIDEO)
title = re.sub(r"/\s*(LYRIC VIDEO\s*)?(lyric video\s*)/i", '', title)
# (Official title Stream)
title = re.sub(r"/\s*(Official title Stream*)/i", '', title)
# (official)? (music)? video
title = re.sub(r"/\s*(of+icial\s*)?(music\s*)?video/i", '', title)
# (official)? (music)? audio
title = re.sub(r"/\s*(of+icial\s*)?(music\s*)?audio/i", '', title)
# (ALBUM title)
title = re.sub(r"/\s*(ALBUM title\s*)?(album title\s*)/i", '', title)
# (Cover Art)
title = re.sub(r"/\s*(COVER ART\s*)?(Cover Art\s*)/i", '', title)
# (official)
title = re.sub(r"/\s*\(\s*of+icial\s*\)/i", '', title)
# (1999)
title = re.sub(r"/\s*\(\s*[0-9]{4}\s*\)/i", '', title)
# HD (HQ)
title = re.sub(r"/\s+\(\s*(HD|HQ)\s*\)$/", '', title)
# HD (HQ)
title = re.sub(r"/\s+(HD|HQ)\s*$/", '', title)
# video clip
title = re.sub(r"/\s*video\s*clip/i", '', title)
# Full Album
title = re.sub(r"/\s*full\s*album/i", '', title)
# live
title = re.sub(r"/\s+\(?live\)?$/i", '', title)
# Leftovers after e.g. (official video)
title = re.sub(r"/\(+\s*\)+/", '', title)
# Artist - The new "title title" featuring someone
title = re.sub(r"/^(|.*\s)\"(.*)\"(\s.*|)$/", '\2', title)
# 'title title'
title = re.sub(r"/^(|.*\s)'(.*)'(\s.*|)$/", '\2', title)
# trim white chars and dash
title.lstrip(" \t\n\r-")
title.rstrip(" \t\n\r-")
return title