25 lines
1021 B
Python
25 lines
1021 B
Python
import re
|
||
from unidecode import unidecode
|
||
import pyopenjtalk
|
||
|
||
# Body of the character class shared by the two patterns below, so that they
# are always exact complements of each other. The ranges are written as \u
# escapes so they cannot be silently altered by Unicode normalization:
#   A-Za-z \d        ASCII letters and decimal digits
#   \u3005-\u30ff    U+3005 (iteration mark) through the end of katakana,
#                    which includes the hiragana and katakana blocks
#   \u4e00-\u9fff    CJK unified ideographs (kanji)
#   \uff11-\uff19    full-width digits 1-9
#   \uff21-\uff3a    full-width upper-case Latin letters
#   \uff41-\uff5a    full-width lower-case Latin letters
#   \uff66-\uff9d    half-width katakana
_japanese_char_class = (
    r'A-Za-z\d'
    r'\u3005-\u30ff'
    r'\u4e00-\u9fff'
    r'\uff11-\uff19'
    r'\uff21-\uff3a'
    r'\uff41-\uff5a'
    r'\uff66-\uff9d'
)

# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile('[' + _japanese_char_class + ']')

# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile('[^' + _japanese_char_class + ']')
|
||
|
||
# List of (regular expression, replacement) pairs for abbreviations:
|
||
|
||
def _g2p_segment(segment):
    """Return the pyopenjtalk phonemes for one mark-free segment, compacted.

    Segments that do not start with a Japanese character (in practice, the
    empty strings produced by splitting around adjacent marks) yield ''.
    'pau' tokens and spaces are removed from the g2p output.
    """
    if not _japanese_characters.match(segment):
        return ''
    return pyopenjtalk.g2p(segment, kana=False).replace('pau', '').replace(' ', '')


def japanese_cleaners(text):
    """Convert Japanese text to a compact ASCII phoneme string.

    The input is split on every character matched by ``_japanese_marks``.
    Each run between marks is converted with ``pyopenjtalk.g2p``; each mark is
    transliterated with ``unidecode`` and has its spaces removed. The pieces
    are joined back in their original order.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. If it ends in an ASCII letter, a trailing '.' is
        appended. An input that cleans down to nothing returns ''.
    """
    segments = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)

    pieces = []
    # split() always yields exactly one more segment than there are marks, so
    # zip() pairs every mark with the segment that precedes it.
    for segment, mark in zip(segments, marks):
        pieces.append(_g2p_segment(segment))
        pieces.append(unidecode(mark).replace(' ', ''))
    # The segment after the final mark (or the whole text if there are none).
    pieces.append(_g2p_segment(segments[-1]))

    text = ''.join(pieces)
    # Slice rather than index: text may be empty (e.g. '' or whitespace-only
    # input), and text[-1] would raise IndexError in that case.
    if re.match('[A-Za-z]', text[-1:]):
        text += '.'
    return text