Files
youtube-dl/lib/support/base/string.py
2025-12-25 19:42:32 +09:00

85 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import traceback
from . import logger
class SupportString(object):
@classmethod
def get_cate_char_by_first(cls, title): # get_first
value = ord(title[0].upper())
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
if ord('') <= value < ord(''): return ''
return '0Z'
@classmethod
def is_include_hangul(cls, text):
try:
hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', text))
return hanCount > 0
except:
return False
@classmethod
def language_info(cls, text):
try:
text = text.strip().replace(' ', '')
all_count = len(text)
han_count = len(re.findall('[\u3130-\u318F\uAC00-\uD7A3]', text))
eng_count = len(re.findall('[a-zA-Z]', text))
etc_count = len(re.findall('[0-9]', text))
etc_count += len(re.findall('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\|\(\)\[\]\<\>`\'…》:]', text))
if all_count == etc_count:
return (0,0)
han_percent = int(han_count * 100 / (all_count-etc_count))
eng_percent = int(eng_count * 100 / (all_count-etc_count))
return (han_percent, eng_percent)
except Exception as e:
logger.error(f"Exception:{str(e)}")
logger.error(traceback.format_exc())
return False
@classmethod
def remove_special_char(cls, text):
return re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\|\(\)\[\]\<\>`\'…》:]', '', text)
@classmethod
def remove_emoji(cls, text, char=''):
import re
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002500-\U00002BEF" # chinese char
u"\U00002702-\U000027B0"
u"\U00002702-\U000027B0"
#u"\U000024C2-\U0001F251"
u"\U0001f926-\U0001f937"
u"\U00010000-\U0010ffff"
u"\u2640-\u2642"
u"\u2600-\u2B55"
u"\u200d"
u"\u23cf"
u"\u23e9"
u"\u231a"
u"\ufe0f" # dingbats
u"\u3030"
"]+", flags=re.UNICODE)
# Remove emojis from the text
text = emoji_pattern.sub(char, text)
return text