Joeyos's Blog Software Engineer

Java将汉字字符串转换为拼音

2019-01-17
Quan Zhang

汉字转换为拼音

用一种简单的方法将字符串转化为拼音:

  1. 将需要转换的字符串t1里的字符t1[i]按照t3的格式格式化为拼音,并复制给t2
  2. 如果t1[i]不是汉字,则不转换,直接把t1[i]复制给t2
  3. 将t2首字母大写,复制给t4

这里将用到pinyin4j.jar包,请自行百度下载。

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

public class Chinese2Pinyin {
	public static String getPinyin(String src) {
		char[] t1 = null;
		t1 = src.toCharArray();
		String[] t2 = new String[t1.length];
		HanyuPinyinOutputFormat t3 = new HanyuPinyinOutputFormat();
		t3.setCaseType(HanyuPinyinCaseType.LOWERCASE);// 小写格式
		t3.setToneType(HanyuPinyinToneType.WITHOUT_TONE);// 有无音标
		t3.setVCharType(HanyuPinyinVCharType.WITH_V);
		String t4 = "";
		try {
			for (int i = 0; i < t1.length; i++) {
				// 判断是否为汉字字符
				// if(t1[i] >= 32 && t1[i] <= 125)//ASCII码表范围内直接返回
				if (String.valueOf(t1[i]).matches("[\\u4E00-\\u9FA5]+")) {
					t2 = PinyinHelper.toHanyuPinyinStringArray(t1[i], t3);// 转化为拼音
					t4 += t2[0].substring(0, 1).toUpperCase() + t2[0].substring(1);// 首字母大写
				} else {
					t4 += String.valueOf(t1[i]);// 不是汉字不处理
				}
			}
		} catch (BadHanyuPinyinOutputFormatCombination e1) {
			e1.printStackTrace();
		}
		return t4;
	}

	public static void main(String[] args) {
		String s = getPinyin("西安电子科技大学");
		System.out.println(s);
		String s1 = getPinyin("西安");
		System.out.println(s1);
		String s2 = getPinyin("成都");
		System.out.println(s2);// ChengDu
	}
}

运行结果为:

XiAnDianZiKeJiDaXue
XiAn
ChengDou

可以看到,“成都(du)”被转化成了“ChengDou”,这是因为pinyin4j.jar无法识别多音字的原因。如果是多音字,t2[k]存放着某个字符t1[i]的多个拼音,是一个数组,默认取t2[0]为该字的拼音。

多音字处理

这里采用查字典的方法,将需要转换的字符子串与字典里的词组进行匹配,如果匹配到,这把字典里的读音作为改字的拼音。单字随机分配发音,如果是字典里的常用单字,则按照字典发音。 首先,下载多音字字典文件:下载字典

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

public class Chinese2Pinyin {
	public static Map<String, String> dictionary = new HashMap<String, String>();
	static String filePath = "C:\\dict\\duoyinzi_pinyin.txt";
	// 加载多音字词典
	static {
		BufferedReader br = null;
		try {
			File file = new File(filePath);
			br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
			String line = null;
			while ((line = br.readLine()) != null) {
				String[] arr = line.split("#");
				if (StringUtils.isNotEmpty(arr[1])) {
					String[] sems = arr[1].split(" ");
					for (String sem : sems) {
						if (StringUtils.isNotEmpty(sem)) {
							dictionary.put(sem, arr[0]);
						}
					}
				}
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	public static String getPinyin(String src) {
		char[] t1 = null;
		t1 = src.toCharArray();
		String[] t2 = new String[t1.length];
		HanyuPinyinOutputFormat t3 = new HanyuPinyinOutputFormat();
		t3.setCaseType(HanyuPinyinCaseType.LOWERCASE);// 小写格式
		t3.setToneType(HanyuPinyinToneType.WITHOUT_TONE);// 有无音标
		t3.setVCharType(HanyuPinyinVCharType.WITH_V);
		String t4 = "";
		try {
			for (int i = 0; i < t1.length; i++) {
				// 判断是否为汉字字符
				// if(t1[i] >= 32 && t1[i] <= 125)//ASCII码表范围内直接返回
				if (String.valueOf(t1[i]).matches("[\\u4E00-\\u9FA5]+")) {
					t2 = PinyinHelper.toHanyuPinyinStringArray(t1[i], t3);// 转化为拼音
					//如果是单个汉字,不处理,随机分配拼音
					if (i != t1.length - 1 && t1.length != 1) {
						String dic = String.valueOf(t1[i]) + String.valueOf(t1[i + 1]);
						for (String py : t2) {
							if (py.equals(dictionary.get(dic)) || py.equals(dictionary.get(String.valueOf(t1[i])))) {
								t2[0] = py;// 把t2[0]作为存放正确发音的区域
								break;
							}
						}
					} else if (t1.length != 1) {
						String dic = String.valueOf(t1[i - 1]) + String.valueOf(t1[i]);
						for (String py : t2) {
							if (py.equals(dictionary.get(dic)) || py.equals(dictionary.get(String.valueOf(t1[i])))) {
								t2[0] = py;// 把t2[0]作为存放正确发音的区域
								break;
							}
						}
					}
					t4 += t2[0].substring(0, 1).toUpperCase() + t2[0].substring(1);// 首字母大写
				} else {
					t4 += String.valueOf(t1[i]);// 不是汉字不处理
				}
			}
		} catch (BadHanyuPinyinOutputFormatCombination e1) {
			e1.printStackTrace();
		}
		return t4;
	}

	public static void main(String[] args) {
		String s = getPinyin("西安电子科技大学");
		System.out.println(s);
		String s1 = getPinyin("西安");
		System.out.println(s1);
		String s2 = getPinyin("成都");
		System.out.println(s2);// ChengDu
	}
}

运行结果为:

XiAnDianZiKeJiDaXue
XiAn
ChengDu

Similar Posts

Comments