import re

from unidecode import unidecode
import pyopenjtalk


# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(r'[A-Za-z\d々぀-ヿ一-鿿1-9A-Za-zヲ-ン]')

# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(r'[^A-Za-z\d々぀-ヿ一-鿿1-9A-Za-zヲ-ン]')


def _phonemize(sentence):
    """Run ``pyopenjtalk.g2p`` on *sentence* and strip 'pau' tokens and spaces."""
    phonemes = pyopenjtalk.g2p(sentence, kana=False)
    return phonemes.replace('pau', '').replace(' ', '')


def japanese_cleaners(text):
    """Convert *text* to a compact phoneme string with transliterated marks.

    The input is split on every character matched by ``_japanese_marks``.
    Each run between marks that starts with a character matched by
    ``_japanese_characters`` is passed through ``pyopenjtalk.g2p`` (with
    'pau' tokens and spaces removed); each mark is transliterated with
    ``unidecode`` and has its spaces removed. The pieces are concatenated
    in their original order.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. A '.' is appended when the result ends with an
        ASCII letter. An empty result (e.g. empty or whitespace-only input)
        is returned as ``''`` instead of raising ``IndexError``.
    """
    # The pattern has no capture groups, so split() always yields exactly one
    # more item than findall(): sentences[i] precedes marks[i], and
    # sentences[-1] is the trailing run after the last mark.
    sentences = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)

    pieces = []
    for sentence, mark in zip(sentences, marks):
        # Runs between adjacent marks are empty strings; skip those.
        if _japanese_characters.match(sentence):
            pieces.append(_phonemize(sentence))
        pieces.append(unidecode(mark).replace(' ', ''))

    if _japanese_characters.match(sentences[-1]):
        pieces.append(_phonemize(sentences[-1]))

    cleaned = ''.join(pieces)
    # Guard against an empty result before inspecting the last character.
    if cleaned and re.match('[A-Za-z]', cleaned[-1]):
        cleaned += '.'
    return cleaned