维吾尔语基本区转换扩展区 · 开发者必备的知识

~~~
<?php

/**
 * Uyghur text processing.
 *
 * Converts Uyghur text written with basic-area Arabic letters (U+06xx) into
 * positional presentation-form glyphs (U+FBxx-U+FExx), and lays the result
 * out in reversed order for renderers that draw text strictly left to right
 * (the public entry points are named after Photoshop text mode).
 *
 * Date: 2016/08/12
 * Time: 15:51
 */
class Uyghur_Convert_Gd_v3
{
    /** Letters after which the following letter is drawn in a joined form. */
    private static $uyghur_harp = array("ئ", "ب", "پ", "ت", "ج", "چ", "خ", "س", "ش", "غ", "ف", "ق", "ك", "گ", "ڭ", "ل", "م", "ن", "ھ", "ې", "ى", "ي");

    /** Every Uyghur letter; a letter is drawn joined to its successor only if the successor is listed here. */
    private static $uyghur_all = array("ئ", "ا", "ب", "ە", "پ", "ت", "ج", "چ", "خ", "د", "ر", "ز", "ژ", "س", "ش", "غ", "ف", "ق", "ك", "گ", "ڭ", "ل", "م", "ن", "ھ", "و", "ۇ", "ۆ", "ۈ", "ۋ", "ې", "ى", "ي");

    /** Basic letter => its isolated / initial / medial / final presentation glyphs. */
    private static $letters = array(
        "ا" => array("character" => "ا", "isoGlyph" => "ﺍ", "iniGlyph" => "ﺍ", "midGlyph" => "ﺎ", "endGlyph" => "ﺎ"),
        "ە" => array("character" => "ە", "isoGlyph" => "ﻩ", "iniGlyph" => "ﻩ", "midGlyph" => "ﻩ", "endGlyph" => "ﻪ"),
        "ب" => array("character" => "ب", "isoGlyph" => "ﺏ", "iniGlyph" => "ﺑ", "midGlyph" => "ﺒ", "endGlyph" => "ﺐ"),
        "پ" => array("character" => "پ", "isoGlyph" => "ﭖ", "iniGlyph" => "ﭘ", "midGlyph" => "ﭙ", "endGlyph" => "ﭗ"),
        "ت" => array("character" => "ت", "isoGlyph" => "ﺕ", "iniGlyph" => "ﺗ", "midGlyph" => "ﺘ", "endGlyph" => "ﺖ"),
        "ج" => array("character" => "ج", "isoGlyph" => "ﺝ", "iniGlyph" => "ﺟ", "midGlyph" => "ﺠ", "endGlyph" => "ﺞ"),
        "چ" => array("character" => "چ", "isoGlyph" => "ﭺ", "iniGlyph" => "ﭼ", "midGlyph" => "ﭽ", "endGlyph" => "ﭻ"),
        "خ" => array("character" => "خ", "isoGlyph" => "ﺥ", "iniGlyph" => "ﺧ", "midGlyph" => "ﺨ", "endGlyph" => "ﺦ"),
        "د" => array("character" => "د", "isoGlyph" => "ﺩ", "iniGlyph" => "ﺩ", "midGlyph" => "ﺪ", "endGlyph" => "ﺪ"),
        "ر" => array("character" => "ر", "isoGlyph" => "ﺭ", "iniGlyph" => "ﺭ", "midGlyph" => "ﺮ", "endGlyph" => "ﺮ"),
        "ز" => array("character" => "ز", "isoGlyph" => "ﺯ", "iniGlyph" => "ﺯ", "midGlyph" => "ﺰ", "endGlyph" => "ﺰ"),
        "ژ" => array("character" => "ژ", "isoGlyph" => "ﮊ", "iniGlyph" => "ﮊ", "midGlyph" => "ﮋ", "endGlyph" => "ﮋ"),
        "س" => array("character" => "س", "isoGlyph" => "ﺱ", "iniGlyph" => "ﺳ", "midGlyph" => "ﺴ", "endGlyph" => "ﺲ"),
        "ش" => array("character" => "ش", "isoGlyph" => "ﺵ", "iniGlyph" => "ﺷ", "midGlyph" => "ﺸ", "endGlyph" => "ﺶ"),
        "غ" => array("character" => "غ", "isoGlyph" => "ﻍ", "iniGlyph" => "ﻏ", "midGlyph" => "ﻐ", "endGlyph" => "ﻎ"),
        "ق" => array("character" => "ق", "isoGlyph" => "ﻕ", "iniGlyph" => "ﻗ", "midGlyph" => "ﻘ", "endGlyph" => "ﻖ"),
        "ف" => array("character" => "ف", "isoGlyph" => "ﻑ", "iniGlyph" => "ﻓ", "midGlyph" => "ﻔ", "endGlyph" => "ﻒ"),
        "ك" => array("character" => "ك", "isoGlyph" => "ﻙ", "iniGlyph" => "ﻛ", "midGlyph" => "ﻜ", "endGlyph" => "ﻚ"),
        "گ" => array("character" => "گ", "isoGlyph" => "ﮒ", "iniGlyph" => "ﮔ", "midGlyph" => "ﮕ", "endGlyph" => "ﮓ"),
        "ڭ" => array("character" => "ڭ", "isoGlyph" => "ﯓ", "iniGlyph" => "ﯕ", "midGlyph" => "ﯖ", "endGlyph" => "ﯔ"),
        "ل" => array("character" => "ل", "isoGlyph" => "ﻝ", "iniGlyph" => "ﻟ", "midGlyph" => "ﻠ", "endGlyph" => "ﻞ"),
        "م" => array("character" => "م", "isoGlyph" => "ﻡ", "iniGlyph" => "ﻣ", "midGlyph" => "ﻤ", "endGlyph" => "ﻢ"),
        "ن" => array("character" => "ن", "isoGlyph" => "ﻥ", "iniGlyph" => "ﻧ", "midGlyph" => "ﻨ", "endGlyph" => "ﻦ"),
        "ھ" => array("character" => "ھ", "isoGlyph" => "ﮪ", "iniGlyph" => "ﮪ", "midGlyph" => "ﮭ", "endGlyph" => "ﮭ"),
        "و" => array("character" => "و", "isoGlyph" => "ﻭ", "iniGlyph" => "ﻭ", "midGlyph" => "ﻮ", "endGlyph" => "ﻮ"),
        "ۇ" => array("character" => "ۇ", "isoGlyph" => "ﯗ", "iniGlyph" => "ﯗ", "midGlyph" => "ﯘ", "endGlyph" => "ﯘ"),
        "ۆ" => array("character" => "ۆ", "isoGlyph" => "ﯙ", "iniGlyph" => "ﯙ", "midGlyph" => "ﯚ", "endGlyph" => "ﯚ"),
        "ۈ" => array("character" => "ۈ", "isoGlyph" => "ﯛ", "iniGlyph" => "ﯛ", "midGlyph" => "ﯜ", "endGlyph" => "ﯜ"),
        "ۋ" => array("character" => "ۋ", "isoGlyph" => "ﯞ", "iniGlyph" => "ﯞ", "midGlyph" => "ﯟ", "endGlyph" => "ﯟ"),
        "ې" => array("character" => "ې", "isoGlyph" => "ﯤ", "iniGlyph" => "ﯦ", "midGlyph" => "ﯧ", "endGlyph" => "ﯥ"),
        "ى" => array("character" => "ى", "isoGlyph" => "ﻯ", "iniGlyph" => "ﯨ", "midGlyph" => "ﯩ", "endGlyph" => "ﻰ"),
        "ي" => array("character" => "ي", "isoGlyph" => "ﻱ", "iniGlyph" => "ﻳ", "midGlyph" => "ﻴ", "endGlyph" => "ﻲ"),
        "ئ" => array("character" => "ئ", "isoGlyph" => "ﺋ", "iniGlyph" => "ﺋ", "midGlyph" => "ﺌ", "endGlyph" => "ﺌ"),
    );

    /**
     * Marker that convert_to_year() prepends to "digits-text" sequences and
     * that Reverse_Ascii() strips again. It is a run of one repeated letter,
     * so it reads the same after the whole string has been reversed.
     */
    private static $yearMarker = 'يييييييييي';

    /** Placeholder that stands in for ':' while the text is being reordered (same palindrome trick). */
    private static $colonMarker = 'چچچچچچچچچچ';

    /**
     * Converts presentation-form (extended-area) glyphs back to basic letters.
     *
     * The glyph-to-letter map is derived from self::$letters so the forward
     * and backward conversions cannot drift apart. Lam-alef ligatures are
     * expanded to the two basic letters, the full-width colon becomes " : "
     * and the ellipsis character becomes three dots.
     *
     * @param string $text
     * @return string
     */
    private static function asasiy($text)
    {
        $map = array();
        foreach (self::$letters as $letter) {
            foreach (array('isoGlyph', 'iniGlyph', 'midGlyph', 'endGlyph') as $form) {
                $map[$letter[$form]] = $letter['character'];
            }
        }
        $map['ﻻ'] = "لا";
        $map['ﻼ'] = "لا";
        $map['：'] = " : ";
        $map['…'] = "...";

        // Every key is one complete UTF-8 character and no replacement value
        // contains a key, so a single strtr() pass gives the same result as
        // replacing the groups one after another.
        return strtr($text, $map);
    }

    /**
     * Gets the current character from a UTF-8 string.
     *
     * Only the lead byte is inspected; the bytes following it are not
     * validated. A truncated sequence at the end of the string yields the
     * bytes that are available.
     *
     * @param string      $string    the UTF-8 string
     * @param int         &$pos      current byte position; moved to the next
     *                               character on exit unless $lookahead is true
     * @param int         $length    byte length of $string
     * @param bool        $lookahead leave $pos unchanged if true, advance it if false
     * @param string|null $invalid   value returned in place of an invalid lead
     *                               byte, e.g. "?"; pass null to get null back
     * @return string|null|false the character, $invalid for an invalid lead
     *                           byte, or false when there is nothing left to read
     */
    private static function getChar($string, &$pos, $length, $lookahead = false, $invalid = '?')
    {
        if ($pos >= $length) {
            // no more characters to read
            return false;
        }

        $start = $pos;
        $lead = ord($string[$start]);

        if ($lead < 0x80) {
            $size = 1;              // ASCII
        } elseif ($lead < 0xC0) {
            $size = 0;              // continuation byte: invalid as a first byte
        } elseif ($lead < 0xE0) {
            $size = 2;
        } elseif ($lead < 0xF0) {
            $size = 3;
        } elseif ($lead < 0xF8) {
            $size = 4;
        } else {
            $size = 0;              // out of range as a first byte
        }

        if ($size === 0) {
            $char = $invalid;
            $pos = $start + 1;
        } else {
            $char = (string) substr($string, $start, $size);
            $pos = $start + $size;
        }

        if ($lookahead) {
            // peek only: restore the caller's position
            $pos = $start;
        }

        return $char;
    }

    /**
     * Tells whether the current letter is drawn joined to the previous one.
     *
     * @param string $currentChar unused; kept for signature symmetry
     * @param string $beforeChar  the character that precedes the current one
     * @return bool
     */
    private static function checkLinkBefore($currentChar, $beforeChar)
    {
        return in_array($beforeChar, self::$uyghur_harp, true);
    }

    /**
     * Tells whether the current letter is drawn joined to the next one.
     *
     * "ە" is special-cased: its table entry has no distinct medial glyph, so
     * it must always be treated as not joining forward to get its final form.
     *
     * @param string $currentChar
     * @param string $afterChar the character that follows the current one
     * @return bool
     */
    private static function checkLinkAfter($currentChar, $afterChar)
    {
        if ($currentChar === "ە") {
            return false;
        }

        return in_array($afterChar, self::$uyghur_all, true);
    }

    /**
     * Replaces every basic Uyghur letter with the presentation glyph that
     * matches its position (isolated / initial / medial / final) and merges
     * lam + alef into the lam-alef ligature. Characters that are not in the
     * letter table are copied through; invalid lead bytes are dropped.
     *
     * @param string $string UTF-8 text in basic letters
     * @return string
     */
    private static function unicode_convert($string)
    {
        // Split into characters first so neighbours can be inspected by index.
        $length = strlen($string);
        $chars = array();
        $pos = 0;
        while (($char = self::getChar($string, $pos, $length, false, null)) !== false) {
            // An invalid lead byte comes back as null: keep it as an empty
            // slot so it emits nothing but still breaks joining.
            $chars[] = (string) $char;
        }

        // What an alef turns the preceding lam glyph into: an unjoined lam
        // gives the isolated ligature, a lam joined to its predecessor gives
        // the final ligature.
        $lam = self::$letters["ل"];
        $ligatures = array(
            $lam['character'] => "ﻻ",
            $lam['isoGlyph']  => "ﻻ",
            $lam['iniGlyph']  => "ﻻ",
            $lam['midGlyph']  => "ﻼ",
            $lam['endGlyph']  => "ﻼ",
        );

        $result = "";
        $lastGlyph = "";    // the text most recently appended to $result
        $count = count($chars);

        for ($i = 0; $i < $count; $i++) {
            $current = $chars[$i];

            if (!isset(self::$letters[$current])) {
                $result .= $current;
                $lastGlyph = $current;
                continue;
            }

            if ($current === "ا" && isset($ligatures[$lastGlyph])) {
                // Overwrite the lam that was just written with the ligature.
                $ligature = $ligatures[$lastGlyph];
                $result = substr_replace($result, $ligature, strlen($result) - strlen($lastGlyph));
                // Remember the ligature so a second alef right after it does
                // not overwrite it again (that used to swallow the letter).
                $lastGlyph = $ligature;
                continue;
            }

            $joinsPrevious = $i > 0 && self::checkLinkBefore($current, $chars[$i - 1]);
            $joinsNext = ($i + 1 < $count) && self::checkLinkAfter($current, $chars[$i + 1]);

            if ($joinsPrevious) {
                $form = $joinsNext ? 'midGlyph' : 'endGlyph';
            } else {
                $form = $joinsNext ? 'iniGlyph' : 'isoGlyph';
            }

            $lastGlyph = self::$letters[$current][$form];
            $result .= $lastGlyph;
        }

        return $result;
    }

    /**
     * Shapes the text with unicode_convert() and then decodes HTML entities
     * (quotes are left encoded because of ENT_NOQUOTES).
     *
     * @param string $string
     * @return string
     */
    public static function html_convert($string)
    {
        return html_entity_decode(self::unicode_convert($string), ENT_NOQUOTES, "UTF-8");
    }

    /**
     * UTF-8 aware str_split().
     *
     * @param string $str
     * @param int    $split_len number of characters per chunk, at least 1
     * @return array|bool the chunks, or false when $split_len is not a positive integer
     */
    private static function utf8_str_split($str, $split_len = 1)
    {
        if (!preg_match('/^[0-9]+$/', (string) $split_len) || $split_len < 1) {
            return false;
        }

        if (mb_strlen($str, 'UTF-8') <= $split_len) {
            return array($str);
        }

        $matched = preg_match_all('/.{' . $split_len . '}|[^\x00]{1,' . $split_len . '}$/us', $str, $chunks);
        if ($matched === false) {
            // The pattern fails on malformed UTF-8; hand the text back as one
            // chunk instead of reading an undefined match array.
            return array($str);
        }

        return $chunks[0];
    }

    /**
     * Converts text for Photoshop text mode: optionally shapes it, reverses
     * the whole string, then puts every run of non-presentation-form
     * characters back into reading order.
     *
     * @param string $ThisText the text
     * @param bool   $convert  normalise to basic letters and shape them first
     * @return string
     */
    public static function convert_to_ps($ThisText, $convert = true)
    {
        if ($convert === true) {
            $ThisText = self::html_convert(self::asasiy($ThisText));
        }

        return self::Reverse_Ascii(self::Reverse_String($ThisText));
    }

    /**
     * Reverses a UTF-8 string character by character.
     *
     * @param string $source
     * @return string
     */
    protected static function Reverse_String($source)
    {
        return implode("", array_reverse(self::utf8_str_split($source)));
    }

    /**
     * Re-reverses every run of characters that are neither whitespace nor
     * presentation-form glyphs, so such runs read forwards again after the
     * whole string was reversed.
     *
     * A run carrying the year marker has the marker removed; if what is left
     * contains anything other than CJK ideographs, ASCII letters, digits or
     * ':', the run is returned without being reversed.
     *
     * @param string $source
     * @return string|string[]|null
     */
    private static function Reverse_Ascii($source)
    {
        return preg_replace_callback('/([^\x{FB00}-\x{FEFF}\s]+)/u', static function ($match) {
            $run = $match[0];

            if (strpos($run, self::$yearMarker) !== false) {
                $run = str_replace(self::$yearMarker, "", $run);
                if (preg_match('/([^\x{4e00}-\x{9fa5}a-zA-Z0-9:]+)/u', $run)) {
                    return $run;
                }
            }

            return self::Reverse_String($run);
        }, $source);
    }

    /**
     * Collapses each run of whitespace to its last whitespace character.
     *
     * @param string $source
     * @return string|string[]|null
     */
    protected static function letter_replace($source)
    {
        // Drops every whitespace character that is followed by another one.
        return preg_replace('/\s(?=\s)/', '', $source);
    }

    /**
     * Converts mixed Uyghur / ASCII text for Photoshop text mode, keeping
     * ASCII words, numbers, brackets and punctuation readable after the
     * reversal done by convert_to_ps().
     *
     * The text is first tagged with ASCII placeholders (t0..t3 for brackets
     * and quotes, f1/f2 for word positions, nbspnbsp for spaces), reordered,
     * and then the placeholders are turned back into real characters.
     *
     * @param string $source
     * @param bool   $convert shape basic letters into presentation glyphs first
     * @return string|string[]|null
     */
    public static function convert_to_ps2($source, $convert = false)
    {
        if ($convert === true) {
            $source = self::html_convert($source);
        }

        // Move a leading "12- " style dash and trailing ".:=" punctuation in
        // front of the ASCII token they belong to.
        $source = preg_replace('/^(\d+)(\-)(\s)/', '${2}${1}${3}', $source);
        $source = preg_replace('/([A-Za-z0-9]+)([.:=]+)(\s)/', '${2}${1}${3}', $source);

        // Tag parentheses around ASCII tokens: t0 decodes to "(" and t1 to ")"
        // at the end. These patterns were published with "$" where the
        // escaped parentheses belong (a typical math-delimiter conversion of
        // "\(" and "\)"); in that form two of them could never match and the
        // third wrapped any trailing ASCII word in a stray "(".
        $source = preg_replace('/\(([A-Za-z0-9]+)\)/', 't0t0t0t0t0${1}t1t1t1t1t1', $source);
        $source = preg_replace('/\(([A-Za-z0-9]+)/', 't1t1t1t1t1${1}', $source);
        $source = preg_replace('/([A-Za-z0-9]+)\)/', 't0t0t0t0t0${1}', $source);

        $source = str_replace('»', "t2t2t2t2t2", $source);
        $source = str_replace('«', "t3t3t3t3t3", $source);
        $source = self::convert_to_well_number_string($source);

        $words = self::convert_to_year(explode(" ", $source));

        $tagged = array();
        foreach ($words as $index => $word) {
            // The original peeked with next($source) here; on PHP 7+ foreach
            // does not move the internal pointer, so that call always returned
            // the current word. Testing $word directly keeps that behaviour.
            if (preg_match('/([^\x{4e00}-\x{9fa5}a-zA-Z0-9:.]+)/u', $word)) {
                // Contains something other than CJK / ASCII alphanumerics /
                // ":" / ".": only mirror its parentheses.
                $tagged[] = self::convert_to_brackets($word);
                continue;
            }

            $prefix = ($index === 0) ? 'f1f1f1f1f1' : 'f2f2f2f2f2';
            $tagged[] = $prefix . $word . 'n0u0m0r0i0c';
        }

        $text = implode(" ", $tagged);
        $text = str_replace(' ', 'nbspnbsp', $text);
        $text = self::pinyin_implode_string($text);

        // Reverse for Photoshop; shaping (if requested) has already been done.
        $text = self::convert_to_ps($text, false);

        // Between two glyph runs, move the space from after ":" or "." to
        // before it.
        $text = preg_replace('/([\x{FB00}-\x{FEFF}]+)([:])\s([\x{FB00}-\x{FEFF}]+)/u', '${1} ${2}${3}', $text);
        $text = preg_replace('/([\x{FB00}-\x{FEFF}]+)([.])\s([\x{FB00}-\x{FEFF}]+)/u', '${1} ${2}${3}', $text);
        $text = self::convert_to_well_number_decode($text);

        // Turn the bracket / quote placeholders back into characters.
        $text = str_replace(array('t0t0t0t0t0', 't1t1t1t1t1'), array('(', ')'), $text);
        $text = str_replace(array("t3t3t3t3t3", "t2t2t2t2t2"), array("[", "]"), $text);

        return $text;
    }

    /**
     * Mirrors parentheses: every "(" becomes ")" and vice versa.
     *
     * @param string $source
     * @return string
     */
    protected static function convert_to_brackets($source)
    {
        // strtr() swaps both in one pass, so no temporary placeholder that
        // could collide with real text is needed.
        return strtr($source, array('(' => ')', ')' => '('));
    }

    /**
     * Marks "digits-text" sequences (such as year ranges): the digits are
     * reversed and the year marker is put in front so Reverse_Ascii() can
     * recognise the run later.
     *
     * @param array|string $source
     * @return string|string[]|null
     */
    protected static function convert_to_year($source)
    {
        return preg_replace_callback('/([0-9]+)([-])([^\x{0600}-\x{06ff}]+)/uis', static function ($match) {
            return self::$yearMarker . self::Reverse_String($match[1]) . $match[2] . $match[3];
        }, $source);
    }

    /**
     * Protects "#12" / "%12" prefixes and "text: " colons from the later
     * reordering: the sign is moved behind its number and replaced, like the
     * colon, by a placeholder.
     *
     * @param array|string $source
     * @return string|string[]|null
     */
    protected static function convert_to_well_number_string($source)
    {
        $source = preg_replace_callback('/([#|%])(\d+)/', static function ($match) {
            return $match[2] . self::convert_to_well_number_encode($match[1]);
        }, $source);

        return preg_replace_callback('/([^\x{0600}-\x{06ff}]+)([:])(\s)/uis', static function ($match) {
            return $match[1] . self::convert_to_well_number_encode($match[2]) . $match[3];
        }, $source);
    }

    /**
     * Replaces "#", "%" and ":" with their placeholders.
     *
     * @param array|string $source
     * @return string|string[]
     */
    protected static function convert_to_well_number_encode($source)
    {
        return str_replace(
            array('#', '%', ':'),
            array('f5f0f0f0f1', 'f5f0f0f0f2', self::$colonMarker),
            $source
        );
    }

    /**
     * Turns the placeholders written by convert_to_well_number_encode() back
     * into "#", "%" and ":".
     *
     * @param array|string $source
     * @return string|string[]
     */
    protected static function convert_to_well_number_decode($source)
    {
        return str_replace(
            array('f5f0f0f0f1', 'f5f0f0f0f2', self::$colonMarker),
            array('#', '%', ':'),
            $source
        );
    }

    /**
     * Reorders each run of ASCII tokens produced by convert_to_ps2(): restores
     * the spaces, reverses the word order inside the run and strips the f1/f2
     * position tags.
     *
     * @param string $source
     * @return string|string[]|null
     */
    protected static function pinyin_implode_string($source)
    {
        return preg_replace_callback('/([a-zA-Z0-9-:.]+)/is', static function ($match) {
            $value = str_replace(array('nbspnbsp', 'n0u0m0r0i0c'), array(' ', ''), $match[0]);
            $value = implode(" ", array_reverse(explode(" ", $value)));

            $holdsFirstWord = stripos($value, 'f1f1f1f1f1') !== false;
            $value = str_replace(array('f1f1f1f1f1', 'f2f2f2f2f2'), array('', ''), $value);

            if ($holdsFirstWord) {
                return trim($value) . ' ';
            }

            // Move a trailing space to the front. substr() is used instead of
            // string offsets so an empty value is handled without a warning
            // (the "{}" offset syntax is also gone in PHP 8).
            if (substr($value, 0, 1) !== ' ' && substr($value, -1) === ' ') {
                $value = " " . substr($value, 0, -1);
            }

            return $value;
        }, $source);
    }
}
~~~

> 使用方法如下

~~~
$text = "ياخشىمۇسىز"; // sample Uyghur text written with basic-area letters
$text = Uyghur_Convert_Gd_v3::convert_to_ps2($text, true);
~~~