File Peach/DF/Utf8Codec.php

  1: <?php
  2: /*
  3:  * Copyright (c) 2015 @trashtoy
  4:  * https://github.com/trashtoy/
  5:  * 
  6:  * Permission is hereby granted, free of charge, to any person obtaining a copy of
  7:  * this software and associated documentation files (the "Software"), to deal in
  8:  * the Software without restriction, including without limitation the rights to use,
  9:  * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
 10:  * Software, and to permit persons to whom the Software is furnished to do so,
 11:  * subject to the following conditions:
 12:  * 
 13:  * The above copyright notice and this permission notice shall be included in all
 14:  * copies or substantial portions of the Software.
 15:  * 
 16:  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17:  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 18:  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 19:  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 20:  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21:  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 22:  */
 23: /**
 24:  * PHP class file.
 25:  * @author trashtoy
 26:  * @since  2.1.0
 27:  */
 28: namespace Peach\DF;
 29: use Peach\Util\Strings;
 30: use Exception;
 31: 
 32: /**
 33:  * UTF-8 で符号化された文字列を扱う Codec です.
 34:  * このクラスの decode メソッドは, UTF-8 の文字列を文字単位で分解し,
 35:  * 各文字の Unicode 符号点をあらわす整数の配列を返します.
 36:  * encode メソッドは, 整数の配列を UTF-8 の文字列に変換します.
 37:  * 
 38:  * UTF-8 のビットパターンは以下の通りです.
 39:  * <pre>
 40:  * 0xxxxxxx                                               (00-7f)
 41:  * 110xxxxx 10xxxxxx                                      (c0-df)(80-bf)
 42:  * 1110xxxx 10xxxxxx 10xxxxxx                             (e0-ef)(80-bf)(80-bf)
 43:  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                    (f0-f7)(80-bf)(80-bf)(80-bf)
 44:  * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           (f8-fb)(80-bf)(80-bf)(80-bf)(80-bf)
 45:  * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  (fc-fd)(80-bf)(80-bf)(80-bf)(80-bf)(80-bf)
 46:  * </pre>
 47:  * 
 48:  * RFC 3629 では 5 バイト以上のシーケンスが無効とされましたが, 
 49:  * このクラスは RFC 2279 に基づいて 5 バイト以上のシーケンスも受理します.
 50:  * なお, このクラスはサロゲートペアを考慮しません.
 51:  * 
 52:  * 引用文献: {@link http://ja.wikipedia.org/wiki/UTF-8 UTF-8 - Wikipedia}
 53:  * 
 54:  * このクラスは将来的に状態 (メンバ変数) を持つ可能性が高いので,
 55:  * 敢えて Singleton パターンにしていません.
 56:  */
 57: class Utf8Codec implements Codec
 58: {
 59:     /**
 60:      * 指定された UTF-8 の文字列を Unicode 符号点の配列に変換します.
 61:      * 
 62:      * @param  string $text UTF-8 でエンコードされた文字列
 63:      * @return array        Unicode 符号点の配列
 64:      */
 65:     public function decode($text)
 66:     {
 67:         $bom = chr(0xEF) . chr(0xBB) . chr(0xBF);
 68:         if (Strings::startsWith($text, $bom)) {
 69:             return $this->decode(substr($text, 3));
 70:         }
 71:         
 72:         $context = new Utf8Context($text);
 73:         $result  = array();
 74:         while ($context->hasNext()) {
 75:             $result[] = $context->next();
 76:         }
 77:         
 78:         // 文字列の末尾に不正な文字が存在していた場合,
 79:         // $result の最後の要素に null が代入されるので取り除く
 80:         $count = count($result);
 81:         if ($count && $result[$count - 1] === null) {
 82:             array_pop($result);
 83:         }
 84:         return $result;
 85:     }
 86:     
 87:     /**
 88:      * 指定された Unicode 符号点の配列を UTF-8 文字列に変換します.
 89:      * 引数には Unicode 符号点をあらわす正の整数の配列を指定してください.
 90:      * 配列以外の値を指定した場合は, その引数 (整数でない場合はメソッド内で整数に変換されます)
 91:      * を Unicode 符号点とみなし, 1 文字分の UTF-8 文字列を返します.
 92:      * 
 93:      * @param  array|int $var Unicode 符号点の配列
 94:      * @return string UTF-8 文字列
 95:      */
 96:     public function encode($var)
 97:     {
 98:         return is_array($var) ? array_reduce($var, array($this, "appendChar"), "") : $this->encodeUnicode($var);
 99:     }
100:     
101:     /**
102:      * 指定された文字列の末尾に, 引数の Unicode 符号点を UTF-8 に変換したバイト列を追加します.
103:      * 
104:      * @param string $result
105:      * @param int    $unicode
106:      * @ignore
107:      */
108:     public function appendChar($result, $unicode)
109:     {
110:         return $result . $this->encodeUnicode($unicode);
111:     }
112:     
113:     /**
114:      * 指定された Unicode 符号点を表現する 1 文字分の UTF-8 文字列を返します.
115:      * @param  int $unicode Unicode 符号点
116:      * @return string 引数の Unicode 文字を表現する UTF-8 文字列
117:      */
118:     private function encodeUnicode($unicode)
119:     {
120:         if (!is_int($unicode)) {
121:             return $this->encodeUnicode(intval($unicode));
122:         }
123:         if ($unicode < 0 || 0xFFFF < $unicode) {
124:             return $this->encodeUnicode(max(0, $unicode % 0x200000));
125:         }
126:         
127:         $count = $this->getCharCount($unicode);
128:         if ($count === 1) {
129:             return chr($unicode);
130:         }
131:         
132:         $result = array();
133:         for ($i = 1; $i < $count; $i++) {
134:             array_unshift($result, 0x80 + $unicode % 64); // Last 6 bit
135:             $unicode >>= 6;
136:         }
137:         array_unshift($result, $this->getFirstCharPrefix($count) + $unicode);
138:         return implode("", array_map("chr", $result));
139:     }
140:     
141:     /**
142:      * 指定された Unicode 符号点が UTF-8 において何バイトで表現されるか調べます.
143:      * @param  int $unicode Unicode 符号点
144:      * @return int バイト数
145:      */
146:     private function getCharCount($unicode)
147:     {
148:         static $borders = array(
149:             1 => 0x80,       //  7 bit
150:             2 => 0x800,      // 11 bit
151:             3 => 0x10000,    // 16 bit
152:             4 => 0x200000,   // 21 bit
153:         );
154:         foreach ($borders as $i => $border) {
155:             if ($unicode < $border) {
156:                 return $i;
157:             }
158:         }
159:         // @codeCoverageIgnoreStart
160:         throw new Exception("Illegal state");
161:         // @codeCoverageIgnoreEnd
162:     }
163:     
164:     /**
165:      * 引数の値に応じて以下の値を返します. (2 進数表現)
166:      * 
167:      * - 1: 00000000
168:      * - 2: 11000000
169:      * - 3: 11100000
170:      * - 4: 11110000
171:      * - 5: 11111000
172:      * - 6: 11111100
173:      * 
174:      * @param  int $count バイト数
175:      * @return int 引数のバイト数に応じた値
176:      */
177:     private function getFirstCharPrefix($count)
178:     {
179:         $result = 0;
180:         for ($i = 0; $i < $count; $i++) {
181:             $result >>= 1;
182:             $result += 0x80;
183:         }
184:         return $result;
185:     }
186: }
187:
Namespaces

Classes

Interfaces