企业🤖AI智能体构建引擎,智能编排和调试,一键部署,支持私有化部署方案 广告
~~~
<?php

/**
 * info: Uyghur text processing
 * Date: 2016/08/12
 * Time: 15:51
 *
 * Converts Uyghur text written with base Arabic-script letters into the
 * positional glyphs (isolated / initial / medial / final) listed in
 * self::$letters, and can additionally reverse the character order of the
 * result (see convert_to_ps() / convert_to_ps2()).
 */
class Uyghur_Convert_Gd_v3
{
    /**
     * Letters that, when they are the PREVIOUS character, make the current
     * letter join backwards (looked up by checkLinkBefore()).
     */
    private static $uyghur_harp = array("ئ", "ب", "پ", "ت", "ج", "چ", "خ", "س", "ش", "غ", "ف", "ق", "ك", "گ", "ڭ", "ل", "م", "ن", "ھ", "ې", "ى", "ي");

    /**
     * Letters that, when they are the NEXT character, make the current
     * letter join forwards (looked up by checkLinkAfter()).
     */
    private static $uyghur_all = array("ئ", "ا", "ب", "ە", "پ", "ت", "ج", "چ", "خ", "د", "ر", "ز", "ژ", "س", "ش", "غ", "ف", "ق", "ك", "گ", "ڭ", "ل", "م", "ن", "ھ", "و", "ۇ", "ۆ", "ۈ", "ۋ", "ې", "ى", "ي");

    /**
     * Glyph table: base letter => numeric HTML entities for the letter itself
     * and for its isolated, initial, medial and final glyphs.
     */
    private static $letters = array(
        "ا" => array("character" => "&#x627;", "isoGlyph" => "&#xFE8D;", "iniGlyph" => "&#xFE8D;", "midGlyph" => "&#xFE8E;", "endGlyph" => "&#xFE8E;"),
        "ە" => array("character" => "&#x6d5;", "isoGlyph" => "&#xFEE9;", "iniGlyph" => "&#xFEE9;", "midGlyph" => "&#xFEE9;", "endGlyph" => "&#xFEEA;"),
        "ب" => array("character" => "&#x628;", "isoGlyph" => "&#xFE8F;", "iniGlyph" => "&#xFE91;", "midGlyph" => "&#xFE92;", "endGlyph" => "&#xFE90;"),
        "پ" => array("character" => "&#x67e;", "isoGlyph" => "&#xFB56;", "iniGlyph" => "&#xFB58;", "midGlyph" => "&#xFB59;", "endGlyph" => "&#xFB57;"),
        "ت" => array("character" => "&#x62a;", "isoGlyph" => "&#xFE95;", "iniGlyph" => "&#xFE97;", "midGlyph" => "&#xFE98;", "endGlyph" => "&#xFE96;"),
        "ج" => array("character" => "&#x62c;", "isoGlyph" => "&#xFE9D;", "iniGlyph" => "&#xFE9F;", "midGlyph" => "&#xFEA0;", "endGlyph" => "&#xFE9E;"),
        "چ" => array("character" => "&#x686;", "isoGlyph" => "&#xFB7A;", "iniGlyph" => "&#xFB7C;", "midGlyph" => "&#xFB7D;", "endGlyph" => "&#xFB7B;"),
        "خ" => array("character" => "&#x62e;", "isoGlyph" => "&#xFEA5;", "iniGlyph" => "&#xFEA7;", "midGlyph" => "&#xFEA8;", "endGlyph" => "&#xFEA6;"),
        "د" => array("character" => "&#x62f;", "isoGlyph" => "&#xFEA9;", "iniGlyph" => "&#xFEA9;", "midGlyph" => "&#xFEAA;", "endGlyph" => "&#xFEAA;"),
        "ر" => array("character" => "&#x631;", "isoGlyph" => "&#xFEAD;", "iniGlyph" => "&#xFEAD;", "midGlyph" => "&#xFEAE;", "endGlyph" => "&#xFEAE;"),
        "ز" => array("character" => "&#x632;", "isoGlyph" => "&#xFEAF;", "iniGlyph" => "&#xFEAF;", "midGlyph" => "&#xFEB0;", "endGlyph" => "&#xFEB0;"),
        "ژ" => array("character" => "&#x698;", "isoGlyph" => "&#xFB8A;", "iniGlyph" => "&#xFB8A;", "midGlyph" => "&#xFB8B;", "endGlyph" => "&#xFB8B;"),
        "س" => array("character" => "&#x633;", "isoGlyph" => "&#xFEB1;", "iniGlyph" => "&#xFEB3;", "midGlyph" => "&#xFEB4;", "endGlyph" => "&#xFEB2;"),
        "ش" => array("character" => "&#x634;", "isoGlyph" => "&#xFEB5;", "iniGlyph" => "&#xFEB7;", "midGlyph" => "&#xFEB8;", "endGlyph" => "&#xFEB6;"),
        "غ" => array("character" => "&#x63a;", "isoGlyph" => "&#xFECD;", "iniGlyph" => "&#xFECF;", "midGlyph" => "&#xFED0;", "endGlyph" => "&#xFECE;"),
        "ق" => array("character" => "&#x642;", "isoGlyph" => "&#xFED5;", "iniGlyph" => "&#xFED7;", "midGlyph" => "&#xFED8;", "endGlyph" => "&#xFED6;"),
        "ف" => array("character" => "&#x641;", "isoGlyph" => "&#xFED1;", "iniGlyph" => "&#xFED3;", "midGlyph" => "&#xFED4;", "endGlyph" => "&#xFED2;"),
        "ك" => array("character" => "&#x643;", "isoGlyph" => "&#xFED9;", "iniGlyph" => "&#xFEDB;", "midGlyph" => "&#xFEDC;", "endGlyph" => "&#xFEDA;"),
        "گ" => array("character" => "&#x6af;", "isoGlyph" => "&#xFB92;", "iniGlyph" => "&#xFB94;", "midGlyph" => "&#xFB95;", "endGlyph" => "&#xFB93;"),
        "ڭ" => array("character" => "&#x6ad;", "isoGlyph" => "&#xFBD3;", "iniGlyph" => "&#xFBD5;", "midGlyph" => "&#xFBD6;", "endGlyph" => "&#xFBD4;"),
        "ل" => array("character" => "&#x644;", "isoGlyph" => "&#xFEDD;", "iniGlyph" => "&#xFEDF;", "midGlyph" => "&#xFEE0;", "endGlyph" => "&#xFEDE;"),
        "م" => array("character" => "&#x645;", "isoGlyph" => "&#xFEE1;", "iniGlyph" => "&#xFEE3;", "midGlyph" => "&#xFEE4;", "endGlyph" => "&#xFEE2;"),
        "ن" => array("character" => "&#x646;", "isoGlyph" => "&#xFEE5;", "iniGlyph" => "&#xFEE7;", "midGlyph" => "&#xFEE8;", "endGlyph" => "&#xFEE6;"),
        "ھ" => array("character" => "&#x6be;", "isoGlyph" => "&#xFBAA;", "iniGlyph" => "&#xFBAA;", "midGlyph" => "&#xFBAD;", "endGlyph" => "&#xFBAD;"),
        "و" => array("character" => "&#x648;", "isoGlyph" => "&#xFEED;", "iniGlyph" => "&#xFEED;", "midGlyph" => "&#xFEEE;", "endGlyph" => "&#xFEEE;"),
        "ۇ" => array("character" => "&#x6c7;", "isoGlyph" => "&#xFBD7;", "iniGlyph" => "&#xFBD7;", "midGlyph" => "&#xFBD8;", "endGlyph" => "&#xFBD8;"),
        "ۆ" => array("character" => "&#x6c6;", "isoGlyph" => "&#xFBD9;", "iniGlyph" => "&#xFBD9;", "midGlyph" => "&#xFBDA;", "endGlyph" => "&#xFBDA;"),
        "ۈ" => array("character" => "&#x6c8;", "isoGlyph" => "&#xFBDB;", "iniGlyph" => "&#xFBDB;", "midGlyph" => "&#xFBDC;", "endGlyph" => "&#xFBDC;"),
        "ۋ" => array("character" => "&#x6cb;", "isoGlyph" => "&#xFBDE;", "iniGlyph" => "&#xFBDE;", "midGlyph" => "&#xFBDF;", "endGlyph" => "&#xFBDF;"),
        "ې" => array("character" => "&#x6d0;", "isoGlyph" => "&#xFBE4;", "iniGlyph" => "&#xFBE6;", "midGlyph" => "&#xFBE7;", "endGlyph" => "&#xFBE5;"),
        "ى" => array("character" => "&#x649;", "isoGlyph" => "&#xFEEF;", "iniGlyph" => "&#xFBE8;", "midGlyph" => "&#xFBE9;", "endGlyph" => "&#xFEF0;"),
        "ي" => array("character" => "&#x64a;", "isoGlyph" => "&#xFEF1;", "iniGlyph" => "&#xFEF3;", "midGlyph" => "&#xFEF4;", "endGlyph" => "&#xFEF2;"),
        "ئ" => array("character" => "&#x626;", "isoGlyph" => "&#xFE8B;", "iniGlyph" => "&#xFE8B;", "midGlyph" => "&#xFE8C;", "endGlyph" => "&#xFE8C;"),
    );

    /**
     * Normalises extended-area (presentation form) glyphs back to base letters.
     *
     * Each call maps the glyph variants of one letter to the base letter. The
     * two lam-alef ligatures are expanded to "لا"; every ':' is padded with
     * spaces and '…' is replaced by three dots.
     *
     * @param string $text
     * @return string
     */
    private static function asasiy($text)
    {
        $text = str_replace(array('ا', 'ﺍ', 'ﺍ', 'ﺎ', 'ﺎ'), "ا", $text);
        $text = str_replace(array('ە', 'ﻩ', 'ﻩ', 'ﻩ', 'ﻪ'), "ە", $text);
        $text = str_replace(array('ب', 'ﺏ', 'ﺑ', 'ﺒ', 'ﺐ'), "ب", $text);
        $text = str_replace(array('پ', 'ﭖ', 'ﭘ', 'ﭙ', 'ﭗ'), "پ", $text);
        $text = str_replace(array('ت', 'ﺕ', 'ﺗ', 'ﺘ', 'ﺖ'), "ت", $text);
        $text = str_replace(array('ج', 'ﺝ', 'ﺟ', 'ﺠ', 'ﺞ'), "ج", $text);
        $text = str_replace(array('چ', 'ﭺ', 'ﭼ', 'ﭽ', 'ﭻ'), "چ", $text);
        $text = str_replace(array('خ', 'ﺥ', 'ﺧ', 'ﺨ', 'ﺦ'), "خ", $text);
        $text = str_replace(array('د', 'ﺩ', 'ﺩ', 'ﺪ', 'ﺪ'), "د", $text);
        $text = str_replace(array('ر', 'ﺭ', 'ﺭ', 'ﺮ', 'ﺮ'), "ر", $text);
        $text = str_replace(array('ز', 'ﺯ', 'ﺯ', 'ﺰ', 'ﺰ'), "ز", $text);
        $text = str_replace(array('ژ', 'ﮊ', 'ﮊ', 'ﮋ', 'ﮋ'), "ژ", $text);
        $text = str_replace(array('س', 'ﺱ', 'ﺳ', 'ﺴ', 'ﺲ'), "س", $text);
        $text = str_replace(array('ش', 'ﺵ', 'ﺷ', 'ﺸ', 'ﺶ'), "ش", $text);
        $text = str_replace(array('غ', 'ﻍ', 'ﻏ', 'ﻐ', 'ﻎ'), "غ", $text);
        $text = str_replace(array('ق', 'ﻕ', 'ﻗ', 'ﻘ', 'ﻖ'), "ق", $text);
        $text = str_replace(array('ف', 'ﻑ', 'ﻓ', 'ﻔ', 'ﻒ'), "ف", $text);
        $text = str_replace(array('ك', 'ﻙ', 'ﻛ', 'ﻜ', 'ﻚ'), "ك", $text);
        $text = str_replace(array('گ', 'ﮒ', 'ﮔ', 'ﮕ', 'ﮓ'), "گ", $text);
        $text = str_replace(array('ڭ', 'ﯓ', 'ﯕ', 'ﯖ', 'ﯔ'), "ڭ", $text);
        $text = str_replace(array('ل', 'ﻝ', 'ﻟ', 'ﻠ', 'ﻞ'), "ل", $text);
        $text = str_replace(array('م', 'ﻡ', 'ﻣ', 'ﻤ', 'ﻢ'), "م", $text);
        $text = str_replace(array('ن', 'ﻥ', 'ﻧ', 'ﻨ', 'ﻦ'), "ن", $text);
        $text = str_replace(array('ھ', 'ﮪ', 'ﮪ', 'ﮭ', 'ﮭ'), "ھ", $text);
        $text = str_replace(array('و', 'ﻭ', 'ﻭ', 'ﻮ', 'ﻮ'), "و", $text);
        $text = str_replace(array('ۇ', 'ﯗ', 'ﯗ', 'ﯘ', 'ﯘ'), "ۇ", $text);
        $text = str_replace(array('ۆ', 'ﯙ', 'ﯙ', 'ﯚ', 'ﯚ'), "ۆ", $text);
        $text = str_replace(array('ۈ', 'ﯛ', 'ﯛ', 'ﯜ', 'ﯜ'), "ۈ", $text);
        $text = str_replace(array('ۋ', 'ﯞ', 'ﯞ', 'ﯟ', 'ﯟ'), "ۋ", $text);
        $text = str_replace(array('ې', 'ﯤ', 'ﯦ', 'ﯧ', 'ﯥ'), "ې", $text);
        $text = str_replace(array('ى', 'ﻯ', 'ﯨ', 'ﯩ', 'ﻰ'), "ى", $text);
        $text = str_replace(array('ي', 'ﻱ', 'ﻳ', 'ﻴ', 'ﻲ'), "ي", $text);
        $text = str_replace(array('ئ', 'ﺋ', 'ﺋ', 'ﺌ', 'ﺌ'), "ئ", $text);
        $text = str_replace(array('ﻻ', 'ﻼ'), "لا", $text);
        $text = str_replace(':', " : ", $text);
        $text = str_replace('…', "...", $text);

        return $text;
    }

    /**
     * Gets the current character from a UTF-8 string.
     *
     * The length of the character is derived from its first byte only; the
     * following bytes are not validated. An invalid first byte yields
     * $invalid instead of the byte.
     *
     * @param string      $string    the UTF-8 string
     * @param integer     &$pos      the current byte position within the string;
     *                               advanced past the character on exit unless
     *                               $lookahead is true
     * @param integer     $length    the byte length of the string
     * @param boolean     $lookahead if true, $pos is restored to its entry value
     *                               (peek); if false, $pos is advanced
     * @param string|null $invalid   value returned in place of an invalid first
     *                               byte, e.g. "?"; pass null to return null
     * @return string|false|null the UTF-8 character, $invalid for a bad first
     *                           byte, or false if there are no more characters
     */
    private static function getChar($string, &$pos, $length, $lookahead = false, $invalid = '?')
    {
        if ($pos >= $length) {
            // no more characters to read
            return false;
        }

        // remembered so a lookahead call can rewind
        $start = $pos;

        // "[]" offset access: the "{}" form is a parse error since PHP 8.0
        $char = $string[$pos++];

        if ($char < "\x80") {
            // a 1-byte character
        } elseif ($char < "\xC0") {
            // error: a continuation byte is invalid as a first byte
            $char = $invalid;
        } elseif ($char < "\xE0") {
            // a 2-byte character; substr() (like the branches below) does not
            // raise a warning when the input is truncated
            $char .= substr($string, $pos, 1);
            $pos += 1;
        } elseif ($char < "\xF0") {
            // a 3-byte character
            $char .= substr($string, $pos, 2);
            $pos += 2;
        } elseif ($char < "\xF8") {
            // a 4-byte character
            $char .= substr($string, $pos, 3);
            $pos += 3;
        } else {
            // error: out of range as a first byte
            $char = $invalid;
        }

        if ($lookahead) {
            $pos = $start;
        }

        return $char;
    }

    /**
     * Tells whether the current letter joins to the previous character.
     *
     * Only $beforeChar is inspected; $currentChar is accepted but unused.
     *
     * @param string $currentChar
     * @param mixed  $beforeChar
     * @return bool
     */
    private static function checkLinkBefore($currentChar, $beforeChar)
    {
        return in_array($beforeChar, self::$uyghur_harp, true);
    }

    /**
     * Tells whether the current letter joins to the next character.
     *
     * "ە" never joins forwards; otherwise the next character must be one of
     * self::$uyghur_all.
     *
     * @param string $currentChar
     * @param mixed  $afterChar
     * @return bool
     */
    private static function checkLinkAfter($currentChar, $afterChar)
    {
        if ($currentChar === "ە") {
            return false;
        }

        return in_array($afterChar, self::$uyghur_all, true);
    }

    /**
     * Replaces every letter found in self::$letters by the HTML entity of its
     * positional glyph; all other characters are copied through unchanged.
     *
     * The string is walked with a one-character lookahead so that each letter
     * is shaped knowing both neighbours. An alef directly after a lam glyph
     * replaces that glyph with a lam-alef ligature entity.
     *
     * @param string $string UTF-8 text
     * @return string text with letters replaced by numeric HTML entities
     */
    private static function unicode_convert($string)
    {
        $result = "";
        $lastEmitted = "";           // the piece most recently appended to $result
        $length = strlen($string);
        $prevPos = 0;                // byte offset where $curChar starts
        $curPos = 0;                 // byte offset where $nextChar starts
        $nextPos = 0;                // read cursor handed to getChar()
        $prevChar = "";
        $curChar = "";

        do {
            $prevPos = $curPos;
            $curPos = $nextPos;
            $nextChar = self::getChar($string, $nextPos, $length, false, null);

            // is_string() guard: $curChar is null after an invalid byte
            if (is_string($curChar) && isset(self::$letters[$curChar])) {
                $letter = self::$letters[$curChar];

                // the first character of the string has nothing before it
                $canLinkBefore = $prevPos != 0 && self::checkLinkBefore($curChar, $prevChar);
                // getChar() returns false once the end of the string is reached
                $canLinkAfter = $nextChar !== false && self::checkLinkAfter($curChar, $nextChar);

                $isAlef = $curChar === "ا";

                if ($isAlef && in_array($lastEmitted, array("&#x644;", "&#xFEDD;", "&#xFEDF;"), true)) {
                    // lam not joined backwards + alef => ligature U+FEFB
                    $result = substr_replace($result, "&#xFEFB;", strlen($result) - strlen($lastEmitted));
                    // record the ligature so that another alef is not swallowed
                    $lastEmitted = "&#xFEFB;";
                } elseif ($isAlef && in_array($lastEmitted, array("&#xFEE0;", "&#xFEDE;"), true)) {
                    // lam joined backwards + alef => ligature U+FEFC
                    $result = substr_replace($result, "&#xFEFC;", strlen($result) - strlen($lastEmitted));
                    $lastEmitted = "&#xFEFC;";
                } else {
                    if ($canLinkBefore) {
                        $form = $canLinkAfter ? 'midGlyph' : 'endGlyph';
                    } else {
                        $form = $canLinkAfter ? 'iniGlyph' : 'isoGlyph';
                    }
                    $result .= $letter[$form];
                    $lastEmitted = $letter[$form];
                }
            } else {
                $result .= $curChar;
                $lastEmitted = $curChar;
            }

            $prevChar = $curChar;
            $curChar = $nextChar;
        } while ($curPos < $length);

        return $result;
    }

    /**
     * Shapes the text with unicode_convert() and decodes the resulting HTML
     * entities into UTF-8 characters.
     *
     * NOTE(review): html_entity_decode() runs over the whole string, so
     * entities already present in the input are decoded as well.
     *
     * @param string $string
     * @return string
     */
    public static function html_convert($string)
    {
        return html_entity_decode(self::unicode_convert($string), ENT_NOQUOTES, "UTF-8");
    }

    /**
     * UTF-8 aware str_split().
     *
     * @param string $str
     * @param int    $split_len number of characters per chunk
     * @return array|bool list of chunks, or FALSE when $split_len is not a
     *                    positive integer
     */
    private static function utf8_str_split($str, $split_len = 1)
    {
        if (!preg_match('/^[0-9]+$/', (string) $split_len) || $split_len < 1) {
            return FALSE;
        }

        $len = mb_strlen($str, 'UTF-8');
        if ($len <= $split_len) {
            return array($str);
        }

        preg_match_all('/.{' . $split_len . '}|[^\x00]{1,' . $split_len . '}$/us', $str, $ar);

        return $ar[0];
    }

    /**
     * Converts content to Photoshop text mode.
     *
     * Optionally normalises and shapes the text, then reverses the order of
     * its characters and finally restores the order inside every run handled
     * by Reverse_Ascii().
     *
     * @param string  $ThisText the string
     * @param boolean $convert  run asasiy() and html_convert() first
     * @return string|null
     */
    public static function convert_to_ps($ThisText, $convert = true)
    {
        if ($convert === true) {
            $ThisText = self::asasiy($ThisText);
            $ThisText = self::html_convert($ThisText);
        }

        $ThisText = self::Reverse_String($ThisText);

        return self::Reverse_Ascii($ThisText);
    }

    /**
     * Reverses a string character by character (UTF-8 aware).
     *
     * @param string $source
     * @return string
     */
    protected static function Reverse_String($source)
    {
        return implode("", array_reverse(self::utf8_str_split($source)));
    }

    /**
     * Re-reverses every run of characters that are neither whitespace nor in
     * the range U+FB00..U+FEFF, undoing Reverse_String() for those runs.
     *
     * A run carrying the "يييييييييي" marker (added by convert_to_year()) has
     * the marker removed and is returned without re-reversing when it still
     * contains a character outside [CJK a-zA-Z0-9:].
     *
     * @param string $source
     * @return string|string[]|null
     */
    private static function Reverse_Ascii($source)
    {
        return preg_replace_callback("/([^\x{FB00}-\x{FEFF}\s]+)/u", function ($word) {
            if (stripos($word[0], "يييييييييي") !== false) {
                $word[0] = str_replace("يييييييييي", "", $word[0]);
                if (preg_match("/([^\x{4e00}-\x{9fa5}a-zA-Z0-9:]+)/u", $word[0])) {
                    return $word[0];
                }
            }

            return Uyghur_Convert_Gd_v3::Reverse_String($word[0]);
        }, $source);
    }

    /**
     * Collapses runs of whitespace: every whitespace character that is
     * followed by another one is removed.
     *
     * The pattern has no capture group, so the "\1" replacement expands to an
     * empty string.
     *
     * @param string $source
     * @return string|string[]|null
     */
    protected static function letter_replace($source)
    {
        $source = preg_replace("/\s(?=\s)/", "\\1", $source);

        return $source;
    }

    /**
     * Converts a mixed-script string for convert_to_ps().
     *
     * Punctuation, brackets, '#', '%' and ':' are first moved or replaced by
     * placeholder tokens, the text is split on spaces and every token is
     * either bracket-swapped or wrapped in markers, pinyin_implode_string()
     * and convert_to_ps() are applied, and finally the placeholders are
     * turned back into characters.
     *
     * @param string $source
     * @param bool   $convert shape the text with html_convert() first
     * @return string|string[]|null
     */
    public static function convert_to_ps2($source, $convert = false)
    {
        if ($convert === true) {
            $source = self::html_convert($source);
        }

        $source = preg_replace('/^(\d+)(\-)(\s)/', "\\2\\1\\3", $source);
        $source = preg_replace('/([A-Za-z0-9]+)([.:=]+)(\s)/', "\\2\\1\\3", $source);
        $source = preg_replace('/\(([A-Za-z0-9]+)\)/', "t0t0t0t0t0\\1t1t1t1t1t1", $source);
        $source = preg_replace('/\(([A-Za-z0-9]+)/', "t1t1t1t1t1\\1", $source);
        $source = preg_replace('/([A-Za-z0-9]+)\)/', "t0t0t0t0t0\\1", $source);
        $source = str_replace('»', "t2t2t2t2t2", $source);
        $source = str_replace('«', "t3t3t3t3t3", $source);
        $source = self::convert_to_well_number_string($source);
        $source = explode(" ", $source);
        $source = self::convert_to_year($source);

        $new_source = array();
        foreach ($source as $key => $value) {
            if ($key == 0) {
                if (preg_match("/([^\x{4e00}-\x{9fa5}a-zA-Z0-9:.]+)/u", $value)) {
                    $new_source[] = self::convert_to_brackets($value);
                    continue;
                }
                $new_source[] = 'f1f1f1f1f1' . $value . 'n0u0m0r0i0c';
            } else {
                // NOTE(review): since PHP 7.0 foreach no longer moves the
                // array's internal pointer, so next() advances independently
                // of the loop - confirm whether the following token or the
                // current one was meant to be tested here.
                $next = next($source);
                if (preg_match("/([^\x{4e00}-\x{9fa5}a-zA-Z0-9:.]+)/u", $next)) {
                    $new_source[] = self::convert_to_brackets($value);
                    continue;
                }
                $new_source[] = 'f2f2f2f2f2' . $value . 'n0u0m0r0i0c';
            }
        }

        $new_source2 = implode(" ", $new_source);
        $new_source2 = str_replace(' ', 'nbspnbsp', $new_source2);
        $new_source2 = self::pinyin_implode_string($new_source2); /* replace string */
        $new_source2 = self::convert_to_ps($new_source2, false); /* convert ps text */
        $new_source2 = preg_replace("/([\x{FB00}-\x{FEFF}]+)([:])\s([\x{FB00}-\x{FEFF}]+)/u", "\\1 \\2\\3", $new_source2);
        $new_source2 = preg_replace("/([\x{FB00}-\x{FEFF}]+)([.])\s([\x{FB00}-\x{FEFF}]+)/u", "\\1 \\2\\3", $new_source2);
        $new_source2 = self::convert_to_well_number_decode($new_source2);

        /* special strings: turn the placeholder tokens back into brackets */
        $new_source2 = str_replace(array('t0t0t0t0t0', 't1t1t1t1t1'), array('(', ')'), $new_source2);
        $new_source2 = str_replace(array("t3t3t3t3t3", "t2t2t2t2t2"), array("[", "]"), $new_source2);

        return $new_source2;
    }

    /**
     * Bracket replacement: swaps '(' and ')' using a temporary token.
     *
     * @param string $source
     * @return string|string[]|null
     */
    protected static function convert_to_brackets($source)
    {
        $source = str_replace(')', 'n3u3m3r3i3c', $source);
        $source = str_replace('(', ')', $source);
        $source = str_replace('n3u3m3r3i3c', '(', $source);

        return $source;
    }

    /**
     * Finds "<digits>-<text without U+0600..U+06FF characters>", reverses the
     * digits and prefixes the match with the "يييييييييي" marker that
     * Reverse_Ascii() later recognises.
     *
     * @param array|string $source
     * @return string|string[]|null
     */
    protected static function convert_to_year($source)
    {
        return preg_replace_callback("/([0-9]+)([-])([^\x{0600}-\x{06ff}]+)/uis", function ($word) {
            return 'يييييييييي' . Uyghur_Convert_Gd_v3::Reverse_String($word['1']) . $word['2'] . $word['3'];
        }, $source);
    }

    /**
     * Moves a leading '#', '|' or '%' behind the number that follows it and
     * encodes it, and encodes a ':' that follows text without U+0600..U+06FF
     * characters and precedes whitespace
     * (see convert_to_well_number_encode()).
     *
     * @param array|string $source
     * @return string|string[]|null
     */
    protected static function convert_to_well_number_string($source)
    {
        $source = preg_replace_callback('/([#|%])(\d+)/', function ($word) {
            return $word['2'] . Uyghur_Convert_Gd_v3::convert_to_well_number_encode($word['1']);
        }, $source);

        $source = preg_replace_callback("/([^\x{0600}-\x{06ff}]+)([:])(\s)/uis", function ($word) {
            return $word['1'] . Uyghur_Convert_Gd_v3::convert_to_well_number_encode($word['2']) . $word['3'];
        }, $source);

        return $source;
    }

    /**
     * Replaces '#', '%' and ':' by placeholder tokens.
     *
     * @param array|string $source
     * @return string|string[]|null
     */
    protected static function convert_to_well_number_encode($source)
    {
        $source = str_replace('#', 'f5f0f0f0f1', $source);
        $source = str_replace('%', 'f5f0f0f0f2', $source);
        $source = str_replace(':', 'چچچچچچچچچچ', $source);

        return $source;
    }

    /**
     * Reverses convert_to_well_number_encode(): placeholder tokens become
     * '#', '%' and ':' again.
     *
     * @param array|string $source
     * @return string|string[]|null
     */
    protected static function convert_to_well_number_decode($source)
    {
        $source = str_replace('f5f0f0f0f1', '#', $source);
        $source = str_replace('f5f0f0f0f2', '%', $source);
        $source = str_replace('چچچچچچچچچچ', ':', $source);

        return $source;
    }

    /**
     * Post-processes every run of [a-zA-Z0-9-:.] characters: the 'nbspnbsp'
     * token becomes a space again, 'n0u0m0r0i0c' is dropped, the order of the
     * space-separated words is reversed and the 'f1f1f1f1f1' / 'f2f2f2f2f2'
     * markers are removed.
     *
     * A run that contained the 'f1f1f1f1f1' marker is trimmed and given one
     * trailing space; otherwise a single trailing space is moved to the front.
     *
     * @param string $source
     * @return string|string[]|null
     */
    protected static function pinyin_implode_string($source)
    {
        return preg_replace_callback("/([a-zA-Z0-9-:.]+)/is", function ($word) {
            $value = str_replace(array('nbspnbsp', 'n0u0m0r0i0c'), array(' ', ''), $word['0']);
            $value = implode(" ", array_reverse(explode(" ", $value)));

            $hasFirstMarker = stripos($value, 'f1f1f1f1f1') !== false;
            $value = str_replace(array('f1f1f1f1f1', 'f2f2f2f2f2'), array('', ''), $value);

            if ($hasFirstMarker) {
                return trim($value) . ' ';
            }

            // "[]" / substr() instead of the "{}" offsets removed in PHP 8.0;
            // the empty-string guard avoids an uninitialized-offset warning
            if ($value !== '' && $value[0] !== ' ' && substr($value, -1) === ' ') {
                $value = " " . substr($value, 0, -1);
            }

            return $value;
        }, $source);
    }
}
~~~

> 使用方法如下

~~~
$text = "ياخشىمۇسىز";
// Convert the text; passing true also applies glyph shaping via html_convert().
$text = Uyghur_Convert_Gd_v3::convert_to_ps2($text, true);
~~~