<?php
// warning: this file is encoded in UTF-8!
  3. class HTML5_Data
  4. {
  5. // at some point this should be moved to a .ser file. Another
  6. // possible optimization is to give UTF-8 bytes, not Unicode
  7. // codepoints
  8. // XXX: Not quite sure why it's named this; this is
  9. // actually the numeric entity dereference table.
  10. protected static $realCodepointTable = [
  11. 0x00 => 0xFFFD, // REPLACEMENT CHARACTER
  12. 0x0D => 0x000A, // LINE FEED (LF)
  13. 0x80 => 0x20AC, // EURO SIGN ('€')
  14. 0x81 => 0x0081, // <control>
  15. 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
  16. 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
  17. 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
  18. 0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
  19. 0x86 => 0x2020, // DAGGER ('†')
  20. 0x87 => 0x2021, // DOUBLE DAGGER ('‡')
  21. 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
  22. 0x89 => 0x2030, // PER MILLE SIGN ('‰')
  23. 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
  24. 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
  25. 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
  26. 0x8D => 0x008D, // <control>
  27. 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
  28. 0x8F => 0x008F, // <control>
  29. 0x90 => 0x0090, // <control>
  30. 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
  31. 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
  32. 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
  33. 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
  34. 0x95 => 0x2022, // BULLET ('•')
  35. 0x96 => 0x2013, // EN DASH ('–')
  36. 0x97 => 0x2014, // EM DASH ('—')
  37. 0x98 => 0x02DC, // SMALL TILDE ('˜')
  38. 0x99 => 0x2122, // TRADE MARK SIGN ('™')
  39. 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
  40. 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
  41. 0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
  42. 0x9D => 0x009D, // <control>
  43. 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
  44. 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
  45. ];
  46. protected static $namedCharacterReferences;
  47. protected static $namedCharacterReferenceMaxLength;
  48. /**
  49. * Returns the "real" Unicode codepoint of a malformed character
  50. * reference.
  51. */
  52. public static function getRealCodepoint($ref) {
  53. if (!isset(self::$realCodepointTable[$ref])) {
  54. return false;
  55. } else {
  56. return self::$realCodepointTable[$ref];
  57. }
  58. }
  59. public static function getNamedCharacterReferences() {
  60. if (!self::$namedCharacterReferences) {
  61. self::$namedCharacterReferences = unserialize(
  62. file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
  63. }
  64. return self::$namedCharacterReferences;
  65. }
  66. /**
  67. * Converts a Unicode codepoint to sequence of UTF-8 bytes.
  68. * @note Shamelessly stolen from HTML Purifier, which is also
  69. * shamelessly stolen from Feyd (which is in public domain).
  70. */
  71. public static function utf8chr($code) {
  72. /* We don't care: we live dangerously
  73. * if($code > 0x10FFFF or $code < 0x0 or
  74. ($code >= 0xD800 and $code <= 0xDFFF) ) {
  75. // bits are set outside the "valid" range as defined
  76. // by UNICODE 4.1.0
  77. return "\xEF\xBF\xBD";
  78. }*/
  79. $y = $z = $w = 0;
  80. if ($code < 0x80) {
  81. // regular ASCII character
  82. $x = $code;
  83. } else {
  84. // set up bits for UTF-8
  85. $x = ($code & 0x3F) | 0x80;
  86. if ($code < 0x800) {
  87. $y = (($code & 0x7FF) >> 6) | 0xC0;
  88. } else {
  89. $y = (($code & 0xFC0) >> 6) | 0x80;
  90. if ($code < 0x10000) {
  91. $z = (($code >> 12) & 0x0F) | 0xE0;
  92. } else {
  93. $z = (($code >> 12) & 0x3F) | 0x80;
  94. $w = (($code >> 18) & 0x07) | 0xF0;
  95. }
  96. }
  97. }
  98. // set up the actual character
  99. $ret = '';
  100. if ($w) {
  101. $ret .= chr($w);
  102. }
  103. if ($z) {
  104. $ret .= chr($z);
  105. }
  106. if ($y) {
  107. $ret .= chr($y);
  108. }
  109. $ret .= chr($x);
  110. return $ret;
  111. }
  112. }