uma-tts-api/text/cleaners.py
2024-11-21 10:49:06 +09:00

25 lines
1021 B
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from unidecode import unidecode
import pyopenjtalk
# Characters treated as "Japanese text" (everything else is a mark):
#   A-Za-z, \d        ASCII letters and digits
#   U+3005            ideographic iteration mark
#   U+3040-U+30FF     hiragana and katakana
#   U+4E00-U+9FFF     CJK unified ideographs
#   U+FF11-U+FF19     full-width digits 1-9
#   U+FF21-U+FF3A     full-width capital letters
#   U+FF41-U+FF5A     full-width small letters
#   U+FF66-U+FF9D     half-width katakana
# The ranges are spelled with \u escapes on purpose: literal full-width
# characters are easily damaged by editors or Unicode normalisation, and a
# damaged range makes re.compile() fail when the module is imported.
_japanese_char_class = (
    r'A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff'
    r'\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d'
)

# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile('[' + _japanese_char_class + ']')

# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile('[^' + _japanese_char_class + ']')


def japanese_cleaners(text):
    """Convert Japanese text into a compact phoneme string.

    The input is split at every character matched by ``_japanese_marks``.
    Each run between marks that begins with a character matched by
    ``_japanese_characters`` is replaced by the result of
    ``pyopenjtalk.g2p(run, kana=False)`` with every ``'pau'`` and every space
    removed. Each mark is replaced by ``unidecode(mark)`` with spaces removed.
    The pieces are concatenated in their original order.

    If the resulting string ends in an ASCII letter, a ``'.'`` is appended.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. An input that produces no output (for example
        ``''``) yields ``''``.
    """
    sentences = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)

    # The pattern has no capture groups, so split() always returns exactly
    # one more item than findall(): sentence, mark, sentence, ..., sentence.
    pieces = []
    for index, sentence in enumerate(sentences):
        if _japanese_characters.match(sentence):
            phonemes = pyopenjtalk.g2p(sentence, kana=False)
            pieces.append(phonemes.replace('pau', '').replace(' ', ''))
        if index < len(marks):
            pieces.append(unidecode(marks[index]).replace(' ', ''))

    text = ''.join(pieces)

    # Guard against empty output: indexing text[-1] on '' raises IndexError.
    if text and re.match('[A-Za-z]', text[-1]):
        text += '.'
    return text