Annotation of /trunk/lib/iptc/Unicode.php
Parent Directory
|
Revision Log
Revision 2 - (view) (download)
| 1 : | andphe | 2 | <?php |
| 2 : | /** | ||
| 3 : | * @package zOOmGallery | ||
| 4 : | * @author Mike de Boer <mailme@mikedeboer.nl> | ||
| 5 : | **/ | ||
| 6 : | /****************************************************************************** | ||
| 7 : | * | ||
| 8 : | * Filename: Unicode.php | ||
| 9 : | * | ||
| 10 : | * Description: Provides functions for handling Unicode strings in PHP without | ||
| 11 : | * needing to configure the non-default mbstring extension | ||
| 12 : | * | ||
| 13 : | * Author: Evan Hunter | ||
| 14 : | * | ||
| 15 : | * Date: 27/7/2004 | ||
| 16 : | * | ||
| 17 : | * Project: JPEG Metadata | ||
| 18 : | * | ||
| 19 : | * Revision: 1.10 | ||
| 20 : | * | ||
| 21 : | * Changes: 1.00 -> 1.10 : Added the following functions: | ||
| 22 : | * smart_HTML_Entities | ||
| 23 : | * smart_htmlspecialchars | ||
| 24 : | * HTML_UTF16_UnEscape | ||
| 25 : | * HTML_UTF8_UnEscape | ||
| 26 : | * changed HTML_UTF8_Escape and HTML_UTF16_Escape to | ||
| 27 : | * use smart_htmlspecialchars, so that characters which | ||
| 28 : | * were already escaped would remain intact | ||
| 29 : | * | ||
| 30 : | * | ||
| 31 : | * URL: http://electronics.ozhiker.com | ||
| 32 : | * | ||
| 33 : | * License: This file is part of the PHP JPEG Metadata Toolkit. | ||
| 34 : | * | ||
| 35 : | * The PHP JPEG Metadata Toolkit is free software; you can | ||
| 36 : | * redistribute it and/or modify it under the terms of the | ||
| 37 : | * GNU General Public License as published by the Free Software | ||
| 38 : | * Foundation; either version 2 of the License, or (at your | ||
| 39 : | * option) any later version. | ||
| 40 : | * | ||
| 41 : | * The PHP JPEG Metadata Toolkit is distributed in the hope | ||
| 42 : | * that it will be useful, but WITHOUT ANY WARRANTY; without | ||
| 43 : | * even the implied warranty of MERCHANTABILITY or FITNESS | ||
| 44 : | * FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
| 45 : | * for more details. | ||
| 46 : | * | ||
| 47 : | * You should have received a copy of the GNU General Public | ||
| 48 : | * License along with the PHP JPEG Metadata Toolkit; if not, | ||
| 49 : | * write to the Free Software Foundation, Inc., 59 Temple | ||
| 50 : | * Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 51 : | * | ||
| 52 : | * If you require a different license for commercial or other | ||
| 53 : | * purposes, please contact the author: evan@ozhiker.com | ||
| 54 : | * | ||
| 55 : | ******************************************************************************/ | ||
| 56 : | // MOS Intruder Alerts | ||
| 57 : | defined( '_VALID_MOS' ) or die( 'Direct Access to this location is not allowed.' ); | ||
| 58 : | |||
| 59 : | // TODO: UTF-16 functions have not been tested fully | ||
| 60 : | |||
| 61 : | |||
| 62 : | |||
| 63 : | /****************************************************************************** | ||
| 64 : | * | ||
| 65 : | * Unicode UTF-8 Encoding Functions | ||
| 66 : | * | ||
| 67 : | * Description: UTF-8 is a Unicode encoding system in which extended characters | ||
| 68 : | * use only the upper half (128 values) of the byte range, thus it | ||
| 69 : | * allows the use of normal 7-bit ASCII text. | ||
| 70 : | * 7-Bit ASCII will pass straight through UTF-8 encoding/decoding without change | ||
| 71 : | * | ||
| 72 : | * | ||
| 73 : | * The encoding is as follows: | ||
| 74 : | * Unicode Value : Binary representation (x=data bit) | ||
| 75 : | *-------------------------------------------------------------------------------- | ||
| 76 : | * U-00000000 - U-0000007F: 0xxxxxxx <- This is 7-bit ASCII | ||
| 77 : | * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx | ||
| 78 : | * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx | ||
| 79 : | * U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | ||
| 80 : | * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | ||
| 81 : | * U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | ||
| 82 : | *-------------------------------------------------------------------------------- | ||
| 83 : | * | ||
| 84 : | ******************************************************************************/ | ||
| 85 : | |||
| 86 : | |||
| 87 : | |||
| 88 : | |||
| 89 : | /****************************************************************************** | ||
| 90 : | * | ||
| 91 : | * Unicode UTF-16 Encoding Functions | ||
| 92 : | * | ||
| 93 : | * Description: UTF-16 is a Unicode encoding system that uses 16 bit values for representing | ||
| 94 : | * characters. | ||
| 95 : | * It also has an extended set of characters available by the use | ||
| 96 : | * of surrogate pairs, which are a pair of 16 bit values, giving a | ||
| 97 : | * total data length of 20 useful bits. | ||
| 98 : | * | ||
| 99 : | * | ||
| 100 : | * The encoding is as follows: | ||
| 101 : | * Unicode Value : Binary representation (x=data bit) | ||
| 102 : | *-------------------------------------------------------------------------------- | ||
| 103 : | * U-000000 - U-00D7FF: xxxxxxxx xxxxxxxx | ||
| 104 : | * U-00D800 - U-00DBFF: Not available - used for high surrogate pairs | ||
| 105 : | * U-00DC00 - U-00DFFF: Not available - used for low surrogate pairs | ||
| 106 : | * U-00E000 - U-00FFFF: xxxxxxxx xxxxxxxx | ||
| 107 : | * U-010000 - U-10FFFF: 110110ww wwxxxxxx 110111xx xxxxxxxx ( wwww = (uni-0x10000)/0x10000 ) | ||
| 108 : | *-------------------------------------------------------------------------------- | ||
| 109 : | * | ||
| 110 : | * Surrogate pair Calculations | ||
| 111 : | * | ||
| 112 : | * $hi = ($uni - 0x10000) / 0x400 + 0xD800; | ||
| 113 : | * $lo = ($uni - 0x10000) % 0x400 + 0xDC00; | ||
| 114 : | * | ||
| 115 : | * | ||
| 116 : | * $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); | ||
| 117 : | * | ||
| 118 : | * | ||
| 119 : | ******************************************************************************/ | ||
| 120 : | |||
| 121 : | |||
| 122 : | |||
| 123 : | |||
| 124 : | |||
| 125 : | |||
/******************************************************************************
*
* Function:     UTF8_fix
*
* Description:  Scans a byte string and returns only those parts of it which
*               are structurally well formed UTF-8: a valid lead byte followed
*               by the number of continuation bytes (10xxxxxx) that the lead
*               byte announces. Lead bytes for the historical 5 and 6 byte
*               forms are accepted, matching the encoding table used by the
*               rest of this file.
*
*               A byte that cannot start a sequence, a lead byte whose
*               continuation bytes are missing or malformed, and a sequence
*               cut off by the end of the string are each dropped one byte at
*               a time; scanning then resumes at the following byte so that
*               valid data after the damage is kept.
*
*               Overlong encodings and encoded surrogates are NOT rejected.
*
* Parameters:   utf8_text - a string with possibly badly formed UTF-8 data
*
* Returns:      output - the well formed UTF-8 parts of the string
*
******************************************************************************/

function UTF8_fix( $utf8_text )
{
        $utf8_text = (string) $utf8_text;

        // Length is loop invariant - measure it once
        $length = strlen( $utf8_text );

        // Receives the well formed output
        $output = "";

        // Current position in the input
        $pos = 0;

        while ( $pos < $length )
        {
                // Square bracket offsets are used because the curly brace form
                // ( $str{$i} ) is a fatal error from PHP 8.0 onwards
                $chval = ord( $utf8_text[$pos] );

                // The lead byte tells us how many bytes the character occupies
                if ( $chval <= 0x7F )
                {
                        $bytes = 1;     // 0xxxxxxx - 7-bit ASCII
                }
                elseif ( ( $chval >= 0xC0 ) && ( $chval <= 0xDF ) )
                {
                        $bytes = 2;     // 110xxxxx
                }
                elseif ( ( $chval >= 0xE0 ) && ( $chval <= 0xEF ) )
                {
                        $bytes = 3;     // 1110xxxx
                }
                elseif ( ( $chval >= 0xF0 ) && ( $chval <= 0xF7 ) )
                {
                        $bytes = 4;     // 11110xxx
                }
                elseif ( ( $chval >= 0xF8 ) && ( $chval <= 0xFB ) )
                {
                        $bytes = 5;     // 111110xx
                }
                elseif ( ( $chval >= 0xFC ) && ( $chval <= 0xFD ) )
                {
                        $bytes = 6;     // 1111110x
                }
                else
                {
                        // Stray continuation byte (0x80-0xBF) or 0xFE / 0xFF - skip it
                        $pos++;
                        continue;
                }

                // A sequence is only accepted when all of its bytes are present
                // and every byte after the first has the form 10xxxxxx
                $well_formed = ( $pos + $bytes <= $length );

                for ( $i = 1; $well_formed && ( $i < $bytes ); $i++ )
                {
                        if ( ( ord( $utf8_text[$pos + $i] ) & 0xC0 ) !== 0x80 )
                        {
                                $well_formed = false;
                        }
                }

                if ( $well_formed )
                {
                        // Copy the complete character to the output
                        $output .= substr( $utf8_text, $pos, $bytes );
                        $pos += $bytes;
                }
                else
                {
                        // Drop only the lead byte; the bytes after it are re-examined
                        // on their own so a following valid character is not lost
                        $pos++;
                }
        }

        // Return the result
        return $output;
}
| 216 : | |||
| 217 : | /****************************************************************************** | ||
| 218 : | * End of Function: UTF8_fix | ||
| 219 : | ******************************************************************************/ | ||
| 220 : | |||
| 221 : | |||
| 222 : | |||
| 223 : | |||
| 224 : | |||
| 225 : | |||
| 226 : | |||
| 227 : | |||
| 228 : | |||
/******************************************************************************
*
* Function:     UTF16_fix
*
* Description:  Scans a byte string as a series of 16 bit UTF-16 code units and
*               returns only the parts which are well formed:
*                 - ordinary code units (outside 0xD800 - 0xDFFF) are kept
*                 - a high surrogate (0xD800 - 0xDBFF) is kept only when it is
*                   immediately followed by a low surrogate (0xDC00 - 0xDFFF),
*                   in which case both are kept
*                 - a high surrogate without a following low surrogate is
*                   dropped, and the unit after it is examined on its own
*                 - a lone low surrogate is dropped
*                 - a trailing odd byte is dropped
*
* Parameters:   utf16_text - a string with possibly badly formed UTF-16 data
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      output - the well formed UTF-16 version of the string
*
******************************************************************************/

function UTF16_fix( $utf16_text, $MSB_first )
{
        $utf16_text = (string) $utf16_text;

        // Length is loop invariant - measure it once
        $length = strlen( $utf16_text );

        // Receives the well formed output
        $output = "";

        // Current position in the input
        $pos = 0;

        // A code unit needs two bytes, so stop once fewer than two remain
        while ( $pos + 1 < $length )
        {
                // Square bracket offsets are used because the curly brace form
                // ( $str{$i} ) is a fatal error from PHP 8.0 onwards
                $chval1 = ord( $utf16_text[$pos] );
                $chval2 = ord( $utf16_text[$pos + 1] );
                $pos += 2;

                // Assemble the 16 bit value according to the byte order
                if ( $MSB_first )
                {
                        $UTF16_val = ( $chval1 << 8 ) | $chval2;
                }
                else
                {
                        $UTF16_val = ( $chval2 << 8 ) | $chval1;
                }

                if ( ( $UTF16_val < 0xD800 ) || ( $UTF16_val > 0xDFFF ) )
                {
                        // Normal character (not part of a surrogate pair) - keep it
                        $output .= chr( $chval1 ) . chr( $chval2 );
                        continue;
                }

                if ( $UTF16_val >= 0xDC00 )
                {
                        // Lone low surrogate - do not add it to the output
                        continue;
                }

                // High surrogate: the next code unit must be a low surrogate.
                // Two more bytes are needed, i.e. offsets $pos and $pos + 1.
                // (The previous test, $pos + 3 < length, wrongly rejected a
                // valid pair sitting at the very end of the string.)
                if ( $pos + 1 >= $length )
                {
                        // Not enough data for a low surrogate - the loop condition
                        // ends processing on the next check
                        continue;
                }

                $chval3 = ord( $utf16_text[$pos] );
                $chval4 = ord( $utf16_text[$pos + 1] );

                if ( $MSB_first )
                {
                        $UTF16_val2 = ( $chval3 << 8 ) | $chval4;
                }
                else
                {
                        $UTF16_val2 = ( $chval4 << 8 ) | $chval3;
                }

                if ( ( $UTF16_val2 >= 0xDC00 ) && ( $UTF16_val2 <= 0xDFFF ) )
                {
                        // Complete surrogate pair - keep both halves
                        $output .= chr( $chval1 ) . chr( $chval2 ) . chr( $chval3 ) . chr( $chval4 );

                        // Skip over the low surrogate
                        $pos += 2;
                }

                // Otherwise only the high surrogate is dropped; $pos still points at
                // the following code unit, which is processed on the next pass
        }

        // Return the result
        return $output;
}
| 358 : | |||
| 359 : | /****************************************************************************** | ||
| 360 : | * End of Function: UTF16_fix | ||
| 361 : | ******************************************************************************/ | ||
| 362 : | |||
| 363 : | |||
| 364 : | |||
| 365 : | |||
| 366 : | |||
/******************************************************************************
*
* Function:     UTF8_to_unicode_array
*
* Description:  Converts a string encoded with Unicode UTF-8, to an array of
*               numbers which represent unicode character numbers.
*
*               Bytes which cannot start a UTF-8 sequence are skipped. A lead
*               byte announcing more bytes than remain in the string is also
*               skipped. The bytes following a lead byte are not checked for
*               the 10xxxxxx form - their top two bits are simply masked off.
*
* Parameters:   utf8_text - a string containing the UTF-8 data
*
* Returns:      output - the array containing the unicode character numbers
*
******************************************************************************/

function UTF8_to_unicode_array( $utf8_text )
{
        $utf8_text = (string) $utf8_text;

        // Length is loop invariant - measure it once
        $length = strlen( $utf8_text );

        // Receives the unicode character numbers
        $output = array( );

        for ( $pos = 0; $pos < $length; $pos++ )
        {
                // Square bracket offsets are used because the curly brace form
                // ( $str{$i} ) is a fatal error from PHP 8.0 onwards
                $chval = ord( $utf8_text[$pos] );

                // The lead byte tells us how many bytes the character occupies;
                // masking removes the length marker and leaves its data bits
                if ( $chval <= 0x7F )
                {
                        $bytes = 1;
                        $outputval = $chval;            // 0xxxxxxx - 7-bit ASCII is unchanged
                }
                elseif ( ( $chval >= 0xC0 ) && ( $chval <= 0xDF ) )
                {
                        $bytes = 2;
                        $outputval = $chval & 0x1F;     // strip leading 110b
                }
                elseif ( ( $chval >= 0xE0 ) && ( $chval <= 0xEF ) )
                {
                        $bytes = 3;
                        $outputval = $chval & 0x0F;     // strip leading 1110b
                }
                elseif ( ( $chval >= 0xF0 ) && ( $chval <= 0xF7 ) )
                {
                        $bytes = 4;
                        $outputval = $chval & 0x07;     // strip leading 11110b
                }
                elseif ( ( $chval >= 0xF8 ) && ( $chval <= 0xFB ) )
                {
                        $bytes = 5;
                        $outputval = $chval & 0x03;     // strip leading 111110b
                }
                elseif ( ( $chval >= 0xFC ) && ( $chval <= 0xFD ) )
                {
                        $bytes = 6;
                        $outputval = $chval & 0x01;     // strip leading 1111110b
                }
                else
                {
                        // Invalid lead byte - nothing to decode here
                        continue;
                }

                // Not enough data left for the whole character - skip the lead byte
                if ( $pos + $bytes > $length )
                {
                        continue;
                }

                // Each remaining byte contributes its low 6 bits: shift what we
                // have left by 6 and merge in the next byte without its 10b marker
                for ( ; $bytes > 1; $bytes-- )
                {
                        $pos++;
                        $outputval = ( $outputval << 6 ) | ( ord( $utf8_text[$pos] ) & 0x3F );
                }

                // Add the calculated Unicode number to the output array
                $output[] = $outputval;
        }

        // Return the resulting array
        return $output;
}
| 471 : | |||
| 472 : | /****************************************************************************** | ||
| 473 : | * End of Function: UTF8_to_unicode_array | ||
| 474 : | ******************************************************************************/ | ||
| 475 : | |||
| 476 : | |||
| 477 : | |||
| 478 : | |||
| 479 : | |||
/******************************************************************************
*
* Function:     UTF16_to_unicode_array
*
* Description:  Converts a string encoded with Unicode UTF-16, to an array of
*               numbers which represent unicode character numbers.
*
*               Ordinary code units are output as they are. A high surrogate
*               immediately followed by a low surrogate is combined into one
*               character number:
*                 0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00)
*               A high surrogate without a following low surrogate, a lone low
*               surrogate, and a trailing odd byte produce no output.
*
* Parameters:   utf16_text - a string containing the UTF-16 data
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      output - the array containing the unicode character numbers
*
******************************************************************************/

function UTF16_to_unicode_array( $utf16_text, $MSB_first )
{
        $utf16_text = (string) $utf16_text;

        // Length is loop invariant - measure it once
        $length = strlen( $utf16_text );

        // Receives the unicode character numbers
        $output = array( );

        // Current position in the input
        $pos = 0;

        // A code unit needs two bytes, so stop once fewer than two remain
        while ( $pos + 1 < $length )
        {
                // Square bracket offsets are used because the curly brace form
                // ( $str{$i} ) is a fatal error from PHP 8.0 onwards
                $chval1 = ord( $utf16_text[$pos] );
                $chval2 = ord( $utf16_text[$pos + 1] );
                $pos += 2;

                // Assemble the 16 bit value according to the byte order
                if ( $MSB_first )
                {
                        $UTF16_val = ( $chval1 << 8 ) | $chval2;
                }
                else
                {
                        $UTF16_val = ( $chval2 << 8 ) | $chval1;
                }

                if ( ( $UTF16_val < 0xD800 ) || ( $UTF16_val > 0xDFFF ) )
                {
                        // Normal character (not part of a surrogate pair)
                        $output[] = $UTF16_val;
                        continue;
                }

                if ( $UTF16_val >= 0xDC00 )
                {
                        // Lone low surrogate - do not add it to the output
                        continue;
                }

                // High surrogate: the next code unit must be a low surrogate.
                // Two more bytes are needed, i.e. offsets $pos and $pos + 1.
                // (The previous test, $pos + 3 < length, wrongly rejected a
                // valid pair sitting at the very end of the string.)
                if ( $pos + 1 >= $length )
                {
                        // Not enough data for a low surrogate - the loop condition
                        // ends processing on the next check
                        continue;
                }

                $chval3 = ord( $utf16_text[$pos] );
                $chval4 = ord( $utf16_text[$pos + 1] );

                if ( $MSB_first )
                {
                        $UTF16_val2 = ( $chval3 << 8 ) | $chval4;
                }
                else
                {
                        $UTF16_val2 = ( $chval4 << 8 ) | $chval3;
                }

                if ( ( $UTF16_val2 >= 0xDC00 ) && ( $UTF16_val2 <= 0xDFFF ) )
                {
                        // Complete surrogate pair - combine the two 10 bit halves
                        $output[] = 0x10000 + ( ( $UTF16_val - 0xD800 ) * 0x400 ) + ( $UTF16_val2 - 0xDC00 );

                        // Skip over the low surrogate
                        $pos += 2;
                }

                // Otherwise only the high surrogate is dropped; $pos still points at
                // the following code unit, which is processed on the next pass
        }

        // Return the result
        return $output;
}
| 610 : | |||
| 611 : | /****************************************************************************** | ||
| 612 : | * End of Function: UTF16_to_unicode_array | ||
| 613 : | ******************************************************************************/ | ||
| 614 : | |||
| 615 : | |||
| 616 : | |||
| 617 : | |||
| 618 : | |||
| 619 : | |||
| 620 : | |||
/******************************************************************************
*
* Function:     unicode_array_to_UTF8
*
* Description:  Converts an array of unicode character numbers to a string
*               encoded by UTF-8. Values from 0x00 to 0x7FFFFFFF are encoded
*               using 1 to 6 bytes according to the table at the top of this
*               file; values outside that range produce no output.
*
*               Bit fields are extracted with integer shifts. Division was
*               used previously, which yields fractional floats and depends on
*               implicit float to int truncation (deprecated since PHP 8.1).
*
* Parameters:   unicode_array - the array containing unicode character numbers
*
* Returns:      output - the UTF-8 encoded string representing the data
*
******************************************************************************/

function unicode_array_to_UTF8( $unicode_array )
{
        // Receives the UTF-8 output
        $output = "";

        foreach ( $unicode_array as $unicode_char )
        {
                // The size of the number decides how many bytes are needed
                if ( ( $unicode_char >= 0x00 ) && ( $unicode_char <= 0x7F ) )
                {
                        // 1 byte : 0xxxxxxx - output equals input for 7-bit ASCII
                        $output .= chr( $unicode_char );
                        continue;
                }
                elseif ( ( $unicode_char >= 0x80 ) && ( $unicode_char <= 0x7FF ) )
                {
                        // 2 bytes : 110xxxxx 10xxxxxx
                        $trailing = 1;
                        $marker = 0xC0;
                }
                elseif ( ( $unicode_char >= 0x800 ) && ( $unicode_char <= 0xFFFF ) )
                {
                        // 3 bytes : 1110xxxx 10xxxxxx 10xxxxxx
                        $trailing = 2;
                        $marker = 0xE0;
                }
                elseif ( ( $unicode_char >= 0x10000 ) && ( $unicode_char <= 0x1FFFFF ) )
                {
                        // 4 bytes : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                        $trailing = 3;
                        $marker = 0xF0;
                }
                elseif ( ( $unicode_char >= 0x200000 ) && ( $unicode_char <= 0x3FFFFFF ) )
                {
                        // 5 bytes : 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                        $trailing = 4;
                        $marker = 0xF8;
                }
                elseif ( ( $unicode_char >= 0x4000000 ) && ( $unicode_char <= 0x7FFFFFFF ) )
                {
                        // 6 bytes : 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                        $trailing = 5;
                        $marker = 0xFC;
                }
                else
                {
                        // Invalid code - do nothing
                        continue;
                }

                // Lead byte: length marker plus the bits above all trailing bytes.
                // The range checks above guarantee these bits fit below the marker.
                $output .= chr( $marker | ( $unicode_char >> ( 6 * $trailing ) ) );

                // Trailing bytes: 10b marker plus 6 data bits each, most significant first
                for ( $shift = 6 * ( $trailing - 1 ); $shift >= 0; $shift -= 6 )
                {
                        $output .= chr( 0x80 | ( ( $unicode_char >> $shift ) & 0x3F ) );
                }
        }

        // Return resulting UTF-8 String
        return $output;
}
| 705 : | |||
| 706 : | /****************************************************************************** | ||
| 707 : | * End of Function: unicode_array_to_UTF8 | ||
| 708 : | ******************************************************************************/ | ||
| 709 : | |||
| 710 : | |||
| 711 : | |||
| 712 : | |||
| 713 : | |||
| 714 : | |||
| 715 : | |||
| 716 : | |||
| 717 : | |||
/******************************************************************************
*
* Function:     unicode_array_to_UTF16
*
* Description:  Converts an array of unicode character numbers to a string
*               encoded by UTF-16. Numbers up to 0xFFFF (excluding the
*               surrogate range 0xD800 - 0xDFFF) become one 16 bit code unit.
*               Numbers from 0x10000 to 0x10FFFF become a high / low surrogate
*               pair. Any other number produces no output.
*
*               Values are split with integer shifts and masks. Division and
*               modulus were used previously, which yields fractional floats
*               and depends on implicit float to int truncation (deprecated
*               since PHP 8.1).
*
* Parameters:   unicode_array - the array containing unicode character numbers
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      output - the UTF-16 encoded string representing the data
*
******************************************************************************/

function unicode_array_to_UTF16( $unicode_array, $MSB_first )
{
        // Receives the UTF-16 output
        $output = "";

        foreach ( $unicode_array as $unicode_char )
        {
                // Work out which 16 bit code units represent this character
                if ( ( ( $unicode_char >= 0x0000 ) && ( $unicode_char <= 0xD7FF ) ) ||
                     ( ( $unicode_char >= 0xE000 ) && ( $unicode_char <= 0xFFFF ) ) )
                {
                        // Normal 16 bit character (not a surrogate pair)
                        $code_units = array( $unicode_char );
                }
                elseif ( ( $unicode_char >= 0x10000 ) && ( $unicode_char <= 0x10FFFF ) )
                {
                        // Surrogate pair required: the 20 bit offset from 0x10000 is
                        // split into an upper and a lower group of 10 bits
                        $offset = $unicode_char - 0x10000;
                        $code_units = array(
                                0xD800 | ( $offset >> 10 ),     // High surrogate
                                0xDC00 | ( $offset & 0x3FF )    // Low surrogate
                        );
                }
                else
                {
                        // Invalid UTF-16 codepoint
                        // Unicode value should never be between 0xD800 and 0xDFFF
                        // Do not output this point - there is no way to encode it in UTF-16
                        continue;
                }

                // Emit each code unit as two bytes in the requested byte order
                foreach ( $code_units as $code_unit )
                {
                        $high_byte = chr( $code_unit >> 8 );
                        $low_byte = chr( $code_unit & 0xFF );

                        if ( $MSB_first )
                        {
                                // Big Endian
                                $output .= $high_byte . $low_byte;
                        }
                        else
                        {
                                // Little Endian
                                $output .= $low_byte . $high_byte;
                        }
                }
        }

        // Return resulting UTF-16 String
        return $output;
}
| 795 : | |||
| 796 : | /****************************************************************************** | ||
| 797 : | * End of Function: unicode_array_to_UTF16 | ||
| 798 : | ******************************************************************************/ | ||
| 799 : | |||
| 800 : | |||
| 801 : | |||
| 802 : | |||
| 803 : | |||
| 804 : | /****************************************************************************** | ||
| 805 : | * | ||
| 806 : | * Function: xml_UTF8_clean | ||
| 807 : | * | ||
| 808 : | * Description: XML has specific requirements about the characters that are | ||
| 809 : | * allowed, and characters that must be escaped. | ||
| 810 : | * This function ensures that all characters in the given string | ||
| 811 : | * are valid, and that characters such as Quotes, Greater than, | ||
| 812 : | * Less than and Ampersand are properly escaped. Newlines and Tabs | ||
| 813 : | * are also escaped. | ||
| 814 : | * Note - Do not use this on constructed XML which includes tags, | ||
| 815 : | * as it will escape the tags. It is designed to be used | ||
| 816 : | * on the tag and attribute names, attribute values, and text. | ||
| 817 : | * | ||
| 818 : | * Parameters: utf8_text - a string containing the UTF-8 data | ||
| 819 : | * | ||
| 820 : | * Returns: output - the cleaned and escaped UTF-8 string, safe for use in XML | ||
| 821 : | * | ||
| 822 : | ******************************************************************************/ | ||
| 823 : | |||
/**
 * Makes a UTF-8 string safe for use as XML text, a tag or attribute name,
 * or an attribute value.
 *
 * Characters outside the ranges XML permits are removed, the reserved
 * characters (quotes, greater than, less than, ampersand) are escaped, and
 * TAB, LF and CR are turned into numeric character references.
 * Do not use this on constructed XML which includes tags, as it will escape
 * the tags.
 *
 * @param string $UTF8_text  a string containing the UTF-8 data
 *
 * @return string  the cleaned and escaped UTF-8 string
 */
function xml_UTF8_clean( $UTF8_text )
{
        // Ensure that the Unicode UTF8 encoding is valid.
        $UTF8_text = UTF8_fix( $UTF8_text );

        // XML only allows characters in the following unicode ranges
        // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // Hence we need to delete any characters that dont fit this

        // Convert the UTF-8 string to an array of unicode character numbers
        $unicode_array = UTF8_to_unicode_array( $UTF8_text );

        // Create a new array to receive the valid unicode character numbers
        $new_unicode_array = array( );

        // Cycle through the unicode character numbers
        foreach( $unicode_array as $unichar )
        {
                // Check if the unicode character number is valid for XML
                if ( ( $unichar == 0x09 ) ||
                     ( $unichar == 0x0A ) ||
                     ( $unichar == 0x0D ) ||
                     ( ( $unichar >= 0x20 ) && ( $unichar <= 0xD7FF ) ) ||
                     ( ( $unichar >= 0xE000 ) && ( $unichar <= 0xFFFD ) ) ||
                     ( ( $unichar >= 0x10000 ) && ( $unichar <= 0x10FFFF ) ) )
                {
                        // Unicode character is valid for XML - add it to the valid characters array
                        $new_unicode_array[] = $unichar;
                }
        }

        // Convert the array of valid unicode character numbers back to UTF-8 encoded text
        $UTF8_text = unicode_array_to_UTF8( $new_unicode_array );

        // Escape any special HTML characters present
        $UTF8_text = htmlspecialchars( $UTF8_text, ENT_QUOTES );

        // Escape CR, LF and TAB characters, so that they are kept and not treated as expendable white space.
        // The replacements must be numeric character references; mapping each
        // character to itself would leave the whitespace unescaped.
        $trans = array( "\x09" => "&#x09;", "\x0A" => "&#x0A;", "\x0D" => "&#x0D;" );
        $UTF8_text = strtr( $UTF8_text, $trans );

        // Return the resulting XML valid string
        return $UTF8_text;
}
| 871 : | |||
| 872 : | /****************************************************************************** | ||
| 873 : | * End of Function: xml_UTF8_clean | ||
| 874 : | ******************************************************************************/ | ||
| 875 : | |||
| 876 : | |||
| 877 : | |||
| 878 : | |||
| 879 : | |||
| 880 : | |||
| 881 : | |||
| 882 : | |||
| 883 : | |||
| 884 : | /****************************************************************************** | ||
| 885 : | * | ||
| 886 : | * Function: xml_UTF16_clean | ||
| 887 : | * | ||
| 888 : | * Description: XML has specific requirements about the characters that are | ||
| 889 : | * allowed, and characters that must be escaped. | ||
| 890 : | * This function ensures that all characters in the given string | ||
| 891 : | * are valid, and that characters such as Quotes, Greater than, | ||
| 892 : | * Less than and Ampersand are properly escaped. Newlines and Tabs | ||
| 893 : | * are also escaped. | ||
| 894 : | * Note - Do not use this on constructed XML which includes tags, | ||
| 895 : | * as it will escape the tags. It is designed to be used | ||
| 896 : | * on the tag and attribute names, attribute values, and text. | ||
| 897 : | * | ||
| 898 : | * Parameters: utf16_text - a string containing the UTF-16 data | ||
| 899 : | * MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first) | ||
| 900 : | * False will cause processing as Little Endian UTF-16 (Intel, LSB first) | ||
| 901 : | * | ||
| 902 : | * Returns: output - the cleaned and escaped UTF-16 string, safe for use in XML | ||
| 903 : | * | ||
| 904 : | ******************************************************************************/ | ||
| 905 : | |||
/**
 * Makes a UTF-16 string safe for use as XML text, a tag or attribute name,
 * or an attribute value.
 *
 * Characters outside the ranges XML permits are removed, the reserved
 * characters (quotes, greater than, less than, ampersand) are escaped, and
 * TAB, LF and CR are turned into numeric character references.
 * Do not use this on constructed XML which includes tags, as it will escape
 * the tags.
 *
 * All escaping is done on unicode character numbers, before the text is
 * encoded back to UTF-16. Byte oriented functions such as htmlspecialchars()
 * must not be run on UTF-16 data: they would match single bytes of 16 bit
 * code units and insert one-byte ASCII text into the 16 bit stream.
 *
 * @param string $UTF16_text  a string containing the UTF-16 data
 * @param bool   $MSB_first   True  - Big Endian UTF-16 (Motorola, MSB first)
 *                            False - Little Endian UTF-16 (Intel, LSB first)
 *
 * @return string  the cleaned and escaped UTF-16 string
 */
function xml_UTF16_clean( $UTF16_text, $MSB_first )
{
        // Ensure that the Unicode UTF16 encoding is valid.
        $UTF16_text = UTF16_fix( $UTF16_text, $MSB_first );

        // Convert the UTF-16 string to an array of unicode character numbers
        // NOTE(review): assumes the character numbers are integers - confirm against UTF16_to_unicode_array
        $unicode_array = UTF16_to_unicode_array( $UTF16_text, $MSB_first );

        // Characters that have to be escaped, keyed by unicode character number.
        // The first five match htmlspecialchars( ..., ENT_QUOTES ); TAB, LF and CR
        // are escaped so that they are kept and not treated as expendable white space
        $escapes = array( 0x22 => "&quot;",
                          0x26 => "&amp;",
                          0x27 => "&#039;",
                          0x3C => "&lt;",
                          0x3E => "&gt;",
                          0x09 => "&#x09;",
                          0x0A => "&#x0A;",
                          0x0D => "&#x0D;" );

        // Create a new array to receive the valid unicode character numbers
        $new_unicode_array = array( );

        // Cycle through the unicode character numbers
        foreach( $unicode_array as $unichar )
        {
                if ( isset( $escapes[ $unichar ] ) )
                {
                        // Character must be escaped - add the character numbers of the
                        // (pure ASCII) escape sequence in its place
                        $escape = $escapes[ $unichar ];
                        $escape_length = strlen( $escape );
                        for ( $pos = 0; $pos < $escape_length; $pos++ )
                        {
                                $new_unicode_array[] = ord( $escape[ $pos ] );
                        }
                }
                // XML only allows characters in the following unicode ranges
                // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
                // (#x9, #xA and #xD were already handled as escapes above)
                else if ( ( ( $unichar >= 0x20 ) && ( $unichar <= 0xD7FF ) ) ||
                          ( ( $unichar >= 0xE000 ) && ( $unichar <= 0xFFFD ) ) ||
                          ( ( $unichar >= 0x10000 ) && ( $unichar <= 0x10FFFF ) ) )
                {
                        // Unicode character is valid for XML - add it to the valid characters array
                        $new_unicode_array[] = $unichar;
                }
        }

        // Convert the array of valid unicode character numbers back to UTF-16
        // encoded text and return the resulting XML valid string
        return unicode_array_to_UTF16( $new_unicode_array, $MSB_first );
}
| 953 : | |||
| 954 : | /****************************************************************************** | ||
| 955 : | * End of Function: xml_UTF16_clean | ||
| 956 : | ******************************************************************************/ | ||
| 957 : | |||
| 958 : | |||
| 959 : | |||
| 960 : | |||
| 961 : | |||
| 962 : | |||
| 963 : | /****************************************************************************** | ||
| 964 : | * | ||
| 965 : | * Function: HTML_UTF8_Escape | ||
| 966 : | * | ||
| 967 : | * Description: A HTML page can display UTF-8 data properly if it has a | ||
| 968 : | * META http-equiv="Content-Type" tag with the content attribute | ||
| 969 : | * including the value: "charset=utf-8". | ||
| 970 : | * Otherwise the ISO-8859-1 character set is usually assumed, and | ||
| 971 : | * Unicode values above 0x7F must be escaped. | ||
| 972 : | * This function takes a UTF-8 encoded string and escapes the | ||
| 973 : | * characters above 0x7F as well as reserved HTML characters such | ||
| 974 : | * as Quotes, Greater than, Less than and Ampersand. | ||
| 975 : | * | ||
| 976 : | * Parameters: utf8_text - a string containing the UTF-8 data | ||
| 977 : | * | ||
| 978 : | * Returns: htmloutput - a string containing the HTML equivalent | ||
| 979 : | * | ||
| 980 : | ******************************************************************************/ | ||
| 981 : | |||
/**
 * Turns a UTF-8 string into HTML that only contains 7 bit characters.
 *
 * Reserved HTML characters are escaped with smart_htmlspecialchars (so text
 * that is already escaped stays intact), and every character above 0x7F is
 * written as a hexadecimal numeric character reference.
 *
 * @param string $UTF8_text  a string containing the UTF-8 data
 *
 * @return string  a string containing the HTML equivalent
 */
function HTML_UTF8_Escape( $UTF8_text )
{
        // Repair the UTF-8 encoding first, then escape the special HTML characters
        $escaped_text = smart_htmlspecialchars( UTF8_fix( $UTF8_text ), ENT_QUOTES );

        // Collect one piece of HTML per unicode character number
        $html_pieces = array( );

        foreach( UTF8_to_unicode_array( $escaped_text ) as $code_point )
        {
                $is_seven_bit = ( $code_point >= 0x00 ) && ( $code_point <= 0x7F );

                if ( ! $is_seven_bit )
                {
                        // Above 0x7F - write a numeric character reference
                        $html_pieces[] = "&#x" . dechex($code_point) . ";";
                        continue;
                }

                // 0x7F or below - the character can be used as is
                $html_pieces[] = chr( $code_point );
        }

        // Join the pieces into the resulting escaped HTML
        return implode( "", $html_pieces );
}
| 1017 : | |||
| 1018 : | /****************************************************************************** | ||
| 1019 : | * End of Function: HTML_UTF8_Escape | ||
| 1020 : | ******************************************************************************/ | ||
| 1021 : | |||
| 1022 : | |||
| 1023 : | |||
| 1024 : | /****************************************************************************** | ||
| 1025 : | * | ||
| 1026 : | * Function: HTML_UTF8_UnEscape | ||
| 1027 : | * | ||
| 1028 : | * Description: Converts HTML which contains escaped decimal or hex characters | ||
| 1029 : | * into UTF-8 text | ||
| 1030 : | * | ||
| 1031 : | * Parameters: HTML_text - a string containing the HTML text to convert | ||
| 1032 : | * | ||
| 1033 : | * Returns: utfoutput - a string containing the UTF-8 equivalent | ||
| 1034 : | * | ||
| 1035 : | ******************************************************************************/ | ||
| 1036 : | |||
/**
 * Converts HTML which contains decimal (&#NNN;) or hexadecimal (&#xHHH;)
 * numeric character references into UTF-8 text.
 *
 * All references are replaced in a single pass, so text produced by one
 * replacement is never decoded a second time. Named entities such as
 * "&amp;" are not touched.
 *
 * @param string $HTML_text  a string containing the HTML text to convert
 *
 * @return string  a string containing the UTF-8 equivalent
 */
function HTML_UTF8_UnEscape( $HTML_text )
{
        // Translation table: numeric character reference => UTF-8 bytes
        $trans = array( );

        // Decimal references, e.g. &#233;
        if ( preg_match_all( '/&#([0-9]+);/', $HTML_text, $matches ) )
        {
                foreach( $matches[1] as $index => $digits )
                {
                        // The regex yields strings - hand over a real number
                        $trans[ $matches[0][$index] ] = unicode_array_to_UTF8( array( (int) $digits ) );
                }
        }

        // Hexadecimal references, e.g. &#xE9; or &#Xe9;
        // Note: inside a character class "|" is a literal pipe, not an
        // alternation, so it must not appear between the allowed characters
        if ( preg_match_all( '/&#[xX]([0-9A-Fa-f]+);/', $HTML_text, $hexmatches ) )
        {
                foreach( $hexmatches[1] as $index => $digits )
                {
                        $trans[ $hexmatches[0][$index] ] = unicode_array_to_UTF8( array( hexdec( $digits ) ) );
                }
        }

        // Nothing to convert - return the text untouched
        if ( count( $trans ) == 0 )
        {
                return $HTML_text;
        }

        return strtr( $HTML_text, $trans );
}
| 1055 : | |||
| 1056 : | /****************************************************************************** | ||
| 1057 : | * End of Function: HTML_UTF8_UnEscape | ||
| 1058 : | ******************************************************************************/ | ||
| 1059 : | |||
| 1060 : | |||
| 1061 : | |||
| 1062 : | |||
| 1063 : | |||
| 1064 : | |||
| 1065 : | /****************************************************************************** | ||
| 1066 : | * | ||
| 1067 : | * Function: HTML_UTF16_Escape | ||
| 1068 : | * | ||
| 1069 : | * Description: A HTML page can display UTF-16 data properly if it has a | ||
| 1070 : | * META http-equiv="Content-Type" tag with the content attribute | ||
| 1071 : | * including the value: "charset=utf-16". | ||
| 1072 : | * Otherwise the ISO-8859-1 character set is usually assumed, and | ||
| 1073 : | * Unicode values above 0x7F must be escaped. | ||
| 1074 : | * This function takes a UTF-16 encoded string and escapes the | ||
| 1075 : | * characters above 0x7F as well as reserved HTML characters such | ||
| 1076 : | * as Quotes, Greater than, Less than and Ampersand. | ||
| 1077 : | * | ||
| 1078 : | * Parameters: utf16_text - a string containing the UTF-16 data | ||
| 1079 : | * MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first) | ||
| 1080 : | * False will cause processing as Little Endian UTF-16 (Intel, LSB first) | ||
| 1081 : | * | ||
| 1082 : | * Returns: htmloutput - a string containing the HTML equivalent | ||
| 1083 : | * | ||
| 1084 : | ******************************************************************************/ | ||
| 1085 : | |||
/**
 * Turns a UTF-16 string into HTML that only contains 7 bit characters.
 *
 * The text is decoded to unicode character numbers and re-encoded as UTF-8
 * before any escaping happens; the escaping itself is then done by
 * HTML_UTF8_Escape. Escaping the raw UTF-16 bytes directly is not possible:
 * a byte oriented search for "&", "<", ">" or quotes would hit single bytes
 * of 16 bit code units and insert one-byte ASCII text into the stream.
 *
 * @param string $UTF16_text  a string containing the UTF-16 data
 * @param bool   $MSB_first   True  - Big Endian UTF-16 (Motorola, MSB first)
 *                            False - Little Endian UTF-16 (Intel, LSB first)
 *
 * @return string  a string containing the HTML equivalent
 */
function HTML_UTF16_Escape( $UTF16_text, $MSB_first )
{
        // Ensure that the Unicode UTF16 encoding is valid.
        $UTF16_text = UTF16_fix( $UTF16_text, $MSB_first );

        // Convert the UTF-16 string to an array of unicode character numbers
        $unicode_array = UTF16_to_unicode_array( $UTF16_text, $MSB_first );

        // Re-encode as UTF-8, where all reserved HTML characters are single
        // bytes, and let the UTF-8 routine escape the reserved characters
        // and everything above 0x7F
        return HTML_UTF8_Escape( unicode_array_to_UTF8( $unicode_array ) );
}
| 1121 : | |||
| 1122 : | /****************************************************************************** | ||
| 1123 : | * End of Function: HTML_UTF16_Escape | ||
| 1124 : | ******************************************************************************/ | ||
| 1125 : | |||
| 1126 : | |||
| 1127 : | /****************************************************************************** | ||
| 1128 : | * | ||
| 1129 : | * Function: HTML_UTF16_UnEscape | ||
| 1130 : | * | ||
| 1131 : | * Description: Converts HTML which contains escaped decimal or hex characters | ||
| 1132 : | * into UTF-16 text | ||
| 1133 : | * | ||
| 1134 : | * Parameters: HTML_text - a string containing the HTML text to be converted | ||
| 1135 : | * MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first) | ||
| 1136 : | * False will cause processing as Little Endian UTF-16 (Intel, LSB first) | ||
| 1137 : | * | ||
| 1138 : | * Returns: utfoutput - a string containing the UTF-16 equivalent | ||
| 1139 : | * | ||
| 1140 : | ******************************************************************************/ | ||
| 1141 : | |||
/**
 * Converts HTML which contains escaped decimal or hex characters into
 * UTF-16 text.
 *
 * @param string $HTML_text  a string containing the HTML text to be converted
 * @param bool   $MSB_first  True  - Big Endian UTF-16 (Motorola, MSB first)
 *                           False - Little Endian UTF-16 (Intel, LSB first)
 *
 * @return string  a string containing the UTF-16 equivalent
 */
function HTML_UTF16_UnEscape( $HTML_text, $MSB_first )
{
        // Resolve the escapes to UTF-8 first, then split that text into
        // unicode character numbers
        $code_points = UTF8_to_unicode_array( HTML_UTF8_UnEscape( $HTML_text ) );

        // Encode the character numbers as UTF-16 in the requested byte order
        return unicode_array_to_UTF16( $code_points, $MSB_first );
}
| 1148 : | |||
| 1149 : | /****************************************************************************** | ||
| 1150 : | * End of Function: HTML_UTF16_UnEscape | ||
| 1151 : | ******************************************************************************/ | ||
| 1152 : | |||
| 1153 : | |||
| 1154 : | |||
| 1155 : | |||
| 1156 : | /****************************************************************************** | ||
| 1157 : | * | ||
| 1158 : | * Function: smart_HTML_Entities | ||
| 1159 : | * | ||
| 1160 : | * Description: Performs the same function as HTML_Entities, but leaves entities | ||
| 1161 : | * that are already escaped intact. | ||
| 1162 : | * | ||
| 1163 : | * Parameters: HTML_text - a string containing the HTML text to be escaped | ||
| 1164 : | * | ||
| 1165 : | * Returns: HTML_text_out - a string containing the escaped HTML text | ||
| 1166 : | * | ||
| 1167 : | ******************************************************************************/ | ||
| 1168 : | |||
/**
 * Performs the same function as htmlentities, but leaves entities that are
 * already escaped intact.
 *
 * @param string $HTML_text  a string containing the HTML text to be escaped
 *
 * @return string  a string containing the escaped HTML text
 */
function smart_HTML_Entities( $HTML_text )
{
        // Get a table containing the HTML entities translations
        $translation_table = get_html_translation_table( HTML_ENTITIES );

        // Change the ampersand to translate to itself, to avoid getting &amp;amp;
        // for text that is already escaped - ampersands are handled separately below
        $translation_table[ chr(38) ] = '&';

        // Translate everything except ampersands
        $HTML_text = strtr( $HTML_text, $translation_table );

        // Regular expression says: find an ampersand, check the text after it,
        // if the text after it is not one of the following, then replace the
        // ampersand with &amp;
        //   a) an entity name: a letter followed by 1 to 7 letters or digits, then a semicolon
        //   b) a hash symbol, 1 to 7 decimal digits, then a semicolon
        //   c) a hash symbol, an 'x' or 'X', 1 to 6 hexadecimal digits, then a semicolon
        return preg_replace( '/&(?![A-Za-z][A-Za-z0-9]{1,7};|#[0-9]{1,7};|#[xX][0-9A-Fa-f]{1,6};)/', '&amp;', $HTML_text );
}
| 1187 : | |||
| 1188 : | /****************************************************************************** | ||
| 1189 : | * End of Function: smart_HTML_Entities | ||
| 1190 : | ******************************************************************************/ | ||
| 1191 : | |||
| 1192 : | |||
| 1193 : | |||
| 1194 : | /****************************************************************************** | ||
| 1195 : | * | ||
| 1196 : | * Function: smart_htmlspecialchars | ||
| 1197 : | * | ||
| 1198 : | * Description: Performs the same function as htmlspecialchars, but leaves characters | ||
| 1199 : | * that are already escaped intact. | ||
| 1200 : | * | ||
| 1201 : | * Parameters: HTML_text - a string containing the HTML text to be escaped | ||
| 1202 : | * | ||
| 1203 : | * Returns: HTML_text_out - a string containing the escaped HTML text | ||
| 1204 : | * | ||
| 1205 : | ******************************************************************************/ | ||
| 1206 : | |||
/**
 * Performs the same function as htmlspecialchars, but leaves characters that
 * are already escaped intact.
 *
 * @param string   $HTML_text    a string containing the HTML text to be escaped
 * @param int|null $quote_style  optional quote handling flag (ENT_COMPAT,
 *                               ENT_QUOTES or ENT_NOQUOTES) handed to
 *                               get_html_translation_table; when left out, the
 *                               default of the running PHP version is used
 *
 * @return string  a string containing the escaped HTML text
 */
function smart_htmlspecialchars( $HTML_text, $quote_style = null )
{
        // Get a table containing the HTML special characters translations
        if ( $quote_style === null )
        {
                $translation_table = get_html_translation_table( HTML_SPECIALCHARS );
        }
        else
        {
                // Honour the quote style the caller asked for
                $translation_table = get_html_translation_table( HTML_SPECIALCHARS, $quote_style );
        }

        // Change the ampersand to translate to itself, to avoid getting &amp;amp;
        // for text that is already escaped - ampersands are handled separately below
        $translation_table[ chr(38) ] = '&';

        // Translate everything except ampersands
        $HTML_text = strtr( $HTML_text, $translation_table );

        // Regular expression says: find an ampersand, check the text after it,
        // if the text after it is not one of the following, then replace the
        // ampersand with &amp;
        //   a) an entity name: a letter followed by 1 to 7 letters or digits, then a semicolon
        //   b) a hash symbol, 1 to 7 decimal digits, then a semicolon
        //   c) a hash symbol, an 'x' or 'X', 1 to 6 hexadecimal digits, then a semicolon
        return preg_replace( '/&(?![A-Za-z][A-Za-z0-9]{1,7};|#[0-9]{1,7};|#[xX][0-9A-Fa-f]{1,6};)/', '&amp;', $HTML_text );
}
| 1225 : | |||
| 1226 : | /****************************************************************************** | ||
| 1227 : | * End of Function: smart_htmlspecialchars | ||
| 1228 : | ******************************************************************************/ | ||
| 1229 : | |||
| 1230 : | |||
| 1231 : | ?> |
| ViewVC Help | |
| Powered by ViewVC 1.0.0 |
Web Hosting provided by Network Redux.

