诸暨麻将添加redis
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 
 
 

618 linhas
26 KiB

  1. // Protocol Buffers - Google's data interchange format
  2. // Copyright 2008 Google Inc. All rights reserved.
  3. // https://developers.google.com/protocol-buffers/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are
  7. // met:
  8. //
  9. // * Redistributions of source code must retain the above copyright
  10. // notice, this list of conditions and the following disclaimer.
  11. // * Redistributions in binary form must reproduce the above
  12. // copyright notice, this list of conditions and the following disclaimer
  13. // in the documentation and/or other materials provided with the
  14. // distribution.
  15. // * Neither the name of Google Inc. nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. // Author: jrm@google.com (Jim Meehan)
  31. #include <google/protobuf/stubs/common.h>
  32. #include <google/protobuf/stubs/stringpiece.h>
  33. namespace google {
  34. namespace protobuf {
  35. namespace internal {
  36. // These four-byte entries compactly encode how many bytes 0..255 to delete
  37. // in making a string replacement, how many bytes to add 0..255, and the offset
  38. // 0..64k-1 of the replacement string in remap_string.
  39. struct RemapEntry {
  40. uint8 delete_bytes;
  41. uint8 add_bytes;
  42. uint16 bytes_offset;
  43. };
  // Exit codes reported by a state-table scan.  All but kExitDstSpaceFull are
  // stored in one-byte state-table entries; kExitDstSpaceFull is produced only
  // by executable code.  To be distinguishable from next-state entries the
  // codes must be contiguous and all <= kExitNone.
  typedef enum {
    kExitDstSpaceFull     = 239,
    kExitIllegalStructure = 240,
    kExitOK               = 241,
    kExitReject           = 242,
    kExitReplace1         = 243,
    kExitReplace2         = 244,
    kExitReplace3         = 245,
    kExitReplace21        = 246,
    kExitReplace31        = 247,
    kExitReplace32        = 248,
    kExitReplaceOffset1   = 249,
    kExitReplaceOffset2   = 250,
    kExitReplace1S0       = 251,
    kExitSpecial          = 252,
    kExitDoAgain          = 253,
    kExitRejectAlt        = 254,
    kExitNone             = 255
  } ExitReason;
  67. // This struct represents one entire state table. The three initialized byte
  68. // areas are state_table, remap_base, and remap_string. state0 and state0_size
  69. // give the byte offset and length within state_table of the initial state --
  70. // table lookups are expected to start and end in this state, but for
  71. // truncated UTF-8 strings, may end in a different state. These allow a quick
  72. // test for that condition. entry_shift is 8 for tables subscripted by a full
  73. // byte value and 6 for space-optimized tables subscripted by only six
  74. // significant bits in UTF-8 continuation bytes.
  75. typedef struct {
  76. const uint32 state0;
  77. const uint32 state0_size;
  78. const uint32 total_size;
  79. const int max_expand;
  80. const int entry_shift;
  81. const int bytes_per_entry;
  82. const uint32 losub;
  83. const uint32 hiadd;
  84. const uint8* state_table;
  85. const RemapEntry* remap_base;
  86. const uint8* remap_string;
  87. const uint8* fast_state;
  88. } UTF8StateMachineObj;
  89. typedef UTF8StateMachineObj UTF8ScanObj;
  // Three-character shorthands for the exit codes, so that a state-table row
  // of 16 entries stays compact.
  #define X__   (kExitIllegalStructure)
  #define RJ_   (kExitReject)
  #define S1_   (kExitReplace1)
  #define S2_   (kExitReplace2)
  #define S3_   (kExitReplace3)
  #define S21   (kExitReplace21)
  #define S31   (kExitReplace31)
  #define S32   (kExitReplace32)
  #define T1_   (kExitReplaceOffset1)
  #define T2_   (kExitReplaceOffset2)
  #define S11   (kExitReplace1S0)
  #define SP_   (kExitSpecial)
  #define D__   (kExitDoAgain)
  #define RJA   (kExitRejectAlt)
  // Parameters of the utf8acceptnonsurrogates table: 9 state blocks of 256
  // one-byte entries each (9 * 256 = 2304 bytes in total).
  static const unsigned int utf8acceptnonsurrogates_STATE0        = 0;    // state[0]
  static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE   = 256;  // =[1]
  static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE    = 2304;
  static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
  static const unsigned int utf8acceptnonsurrogates_SHIFT         = 8;
  static const unsigned int utf8acceptnonsurrogates_BYTES         = 1;
  static const unsigned int utf8acceptnonsurrogates_LOSUB         = 0x20202020;
  static const unsigned int utf8acceptnonsurrogates_HIADD         = 0x00000000;
  113. static const uint8 utf8acceptnonsurrogates[] = {
  114. // state[0] 0x000000 Byte 1
  115. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  116. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  117. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  118. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  119. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  120. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  121. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  122. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  123. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  124. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  125. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  126. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  127. X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  128. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  129. 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
  130. 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  131. // state[1] 0x000080 Byte 2 of 2
  132. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  133. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  134. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  135. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  136. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  137. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  138. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  139. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  140. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  141. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  142. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  143. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  144. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  145. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  146. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  147. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  148. // state[2] 0x000000 Byte 2 of 3
  149. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  150. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  151. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  152. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  153. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  154. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  155. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  156. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  157. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  158. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  159. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  160. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  161. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  162. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  163. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  164. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  165. // state[3] 0x001000 Byte 2 of 3
  166. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  167. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  168. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  169. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  170. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  171. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  172. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  173. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  174. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  175. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  176. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  177. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  178. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  179. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  180. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  181. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  182. // state[4] 0x000000 Byte 2 of 4
  183. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  184. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  185. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  186. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  187. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  188. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  189. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  190. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  191. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  192. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  193. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  194. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  195. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  196. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  197. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  198. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  199. // state[5] 0x040000 Byte 2 of 4
  200. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  201. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  202. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  203. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  204. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  205. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  206. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  207. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  208. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  209. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  210. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  211. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  212. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  213. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  214. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  215. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  216. // state[6] 0x100000 Byte 2 of 4
  217. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  218. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  219. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  220. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  221. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  222. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  223. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  224. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  225. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  226. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  227. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  228. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  229. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  230. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  231. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  232. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  233. // state[7] 0x00d000 Byte 2 of 3
  234. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  235. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  236. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  237. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  238. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  239. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  240. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  241. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  242. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  243. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  244. 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  245. 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  246. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  247. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  248. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  249. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  250. // state[8] 0x00d800 Byte 3 of 3
  251. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  252. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  253. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  254. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  255. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  256. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  257. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  258. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  259. RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
  260. RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
  261. RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
  262. RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
  263. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  264. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  265. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  266. X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
  267. };
  268. // Remap base[0] = (del, add, string_offset)
  269. static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
  270. {0, 0, 0} };
  271. // Remap string[0]
  272. static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
  273. 0 };
  // Per-byte table consulted by the fast scan loops: 0 for every byte value
  // 0x00..0x7f and 1 for every byte value 0x80..0xff.
  static const unsigned char utf8acceptnonsurrogates_fast[256] = {
    // 0x00..0x7f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80..0xff
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  };
  292. static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
  293. utf8acceptnonsurrogates_STATE0,
  294. utf8acceptnonsurrogates_STATE0_SIZE,
  295. utf8acceptnonsurrogates_TOTAL_SIZE,
  296. utf8acceptnonsurrogates_MAX_EXPAND_X4,
  297. utf8acceptnonsurrogates_SHIFT,
  298. utf8acceptnonsurrogates_BYTES,
  299. utf8acceptnonsurrogates_LOSUB,
  300. utf8acceptnonsurrogates_HIADD,
  301. utf8acceptnonsurrogates,
  302. utf8acceptnonsurrogates_remap_base,
  303. utf8acceptnonsurrogates_remap_string,
  304. utf8acceptnonsurrogates_fast
  305. };
  306. #undef X__
  307. #undef RJ_
  308. #undef S1_
  309. #undef S2_
  310. #undef S3_
  311. #undef S21
  312. #undef S31
  313. #undef S32
  314. #undef T1_
  315. #undef T2_
  316. #undef S11
  317. #undef SP_
  318. #undef D__
  319. #undef RJA
  320. // Return true if current Tbl pointer is within state0 range
  321. // Note that unsigned compare checks both ends of range simultaneously
  322. static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
  323. const uint8* Tbl0 = &st->state_table[st->state0];
  324. return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
  325. }
  326. // Scan a UTF-8 string based on state table.
  327. // Always scan complete UTF-8 characters
  328. // Set number of bytes scanned. Return reason for exiting
  329. int UTF8GenericScan(const UTF8ScanObj* st,
  330. const char * str,
  331. int str_length,
  332. int* bytes_consumed) {
  333. *bytes_consumed = 0;
  334. if (str_length == 0) return kExitOK;
  335. int eshift = st->entry_shift;
  336. const uint8* isrc = reinterpret_cast<const uint8*>(str);
  337. const uint8* src = isrc;
  338. const uint8* srclimit = isrc + str_length;
  339. const uint8* srclimit8 = srclimit - 7;
  340. const uint8* Tbl_0 = &st->state_table[st->state0];
  341. DoAgain:
  342. // Do state-table scan
  343. int e = 0;
  344. uint8 c;
  345. const uint8* Tbl2 = &st->fast_state[0];
  346. const uint32 losub = st->losub;
  347. const uint32 hiadd = st->hiadd;
  348. // Check initial few bytes one at a time until 8-byte aligned
  349. //----------------------------
  350. while ((((uintptr_t)src & 0x07) != 0) &&
  351. (src < srclimit) &&
  352. Tbl2[src[0]] == 0) {
  353. src++;
  354. }
  355. if (((uintptr_t)src & 0x07) == 0) {
  356. // Do fast for groups of 8 identity bytes.
  357. // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
  358. // including slowing slightly on cr/lf/ht
  359. //----------------------------
  360. while (src < srclimit8) {
  361. uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
  362. uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
  363. src += 8;
  364. // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
  365. uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
  366. (s4567 - losub) | (s4567 + hiadd);
  367. if ((temp & 0x80808080) != 0) {
  368. // We typically end up here on cr/lf/ht; src was incremented
  369. int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
  370. (Tbl2[src[-6]] | Tbl2[src[-5]]);
  371. if (e0123 != 0) {
  372. src -= 8;
  373. break;
  374. } // Exit on Non-interchange
  375. e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
  376. (Tbl2[src[-2]] | Tbl2[src[-1]]);
  377. if (e0123 != 0) {
  378. src -= 4;
  379. break;
  380. } // Exit on Non-interchange
  381. // Else OK, go around again
  382. }
  383. }
  384. }
  385. //----------------------------
  386. // Byte-at-a-time scan
  387. //----------------------------
  388. const uint8* Tbl = Tbl_0;
  389. while (src < srclimit) {
  390. c = *src;
  391. e = Tbl[c];
  392. src++;
  393. if (e >= kExitIllegalStructure) {break;}
  394. Tbl = &Tbl_0[e << eshift];
  395. }
  396. //----------------------------
  397. // Exit posibilities:
  398. // Some exit code, !state0, back up over last char
  399. // Some exit code, state0, back up one byte exactly
  400. // source consumed, !state0, back up over partial char
  401. // source consumed, state0, exit OK
  402. // For illegal byte in state0, avoid backup up over PREVIOUS char
  403. // For truncated last char, back up to beginning of it
  404. if (e >= kExitIllegalStructure) {
  405. // Back up over exactly one byte of rejected/illegal UTF-8 character
  406. src--;
  407. // Back up more if needed
  408. if (!InStateZero(st, Tbl)) {
  409. do {
  410. src--;
  411. } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  412. }
  413. } else if (!InStateZero(st, Tbl)) {
  414. // Back up over truncated UTF-8 character
  415. e = kExitIllegalStructure;
  416. do {
  417. src--;
  418. } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  419. } else {
  420. // Normal termination, source fully consumed
  421. e = kExitOK;
  422. }
  423. if (e == kExitDoAgain) {
  424. // Loop back up to the fast scan
  425. goto DoAgain;
  426. }
  427. *bytes_consumed = src - isrc;
  428. return e;
  429. }
  430. int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
  431. const char * str,
  432. int str_length,
  433. int* bytes_consumed) {
  434. *bytes_consumed = 0;
  435. if (str_length == 0) return kExitOK;
  436. const uint8* isrc = reinterpret_cast<const uint8*>(str);
  437. const uint8* src = isrc;
  438. const uint8* srclimit = isrc + str_length;
  439. const uint8* srclimit8 = srclimit - 7;
  440. int n;
  441. int rest_consumed;
  442. int exit_reason;
  443. do {
  444. // Check initial few bytes one at a time until 8-byte aligned
  445. while ((((uintptr_t)src & 0x07) != 0) &&
  446. (src < srclimit) && (src[0] < 0x80)) {
  447. src++;
  448. }
  449. if (((uintptr_t)src & 0x07) == 0) {
  450. while ((src < srclimit8) &&
  451. (((reinterpret_cast<const uint32*>(src)[0] |
  452. reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
  453. src += 8;
  454. }
  455. }
  456. while ((src < srclimit) && (src[0] < 0x80)) {
  457. src++;
  458. }
  459. // Run state table on the rest
  460. n = src - isrc;
  461. exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
  462. src += rest_consumed;
  463. } while ( exit_reason == kExitDoAgain );
  464. *bytes_consumed = src - isrc;
  465. return exit_reason;
  466. }
  // Hack: some compilers initialize the static tables at startup, so they
  // cannot be used before that has happened.  Some Protocol Buffer parsing
  // runs at static-init time and may try to validate UTF-8 strings.  Because
  // UTF-8 validation is only used for debugging anyway, validation simply
  // reports success until initialization has occurred.
  namespace {

  // Set to true by the constructor of init_detector.
  bool module_initialized_ = false;

  struct InitDetector {
    InitDetector() { module_initialized_ = true; }
  };

  InitDetector init_detector;

  }  // namespace
  482. bool IsStructurallyValidUTF8(const char* buf, int len) {
  483. if (!module_initialized_) return true;
  484. int bytes_consumed = 0;
  485. UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
  486. buf, len, &bytes_consumed);
  487. return (bytes_consumed == len);
  488. }
  489. int UTF8SpnStructurallyValid(const StringPiece& str) {
  490. if (!module_initialized_) return str.size();
  491. int bytes_consumed = 0;
  492. UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
  493. str.data(), str.size(), &bytes_consumed);
  494. return bytes_consumed;
  495. }
  496. // Coerce UTF-8 byte string in src_str to be
  497. // a structurally-valid equal-length string by selectively
  498. // overwriting illegal bytes with replace_char (typically blank).
  499. // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
  500. // src_str is read-only. If any overwriting is needed, a modified byte string
  501. // is created in idst, length isrclen.
  502. //
  503. // Returns pointer to output buffer, isrc if no changes were made,
  504. // or idst if some bytes were changed.
  505. //
  506. // Fast case: all is structurally valid and no byte copying is done.
  507. //
  508. char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
  509. char* idst,
  510. const char replace_char) {
  511. const char* isrc = src_str.data();
  512. const int len = src_str.length();
  513. int n = UTF8SpnStructurallyValid(src_str);
  514. if (n == len) { // Normal case -- all is cool, return
  515. return const_cast<char*>(isrc);
  516. } else { // Unusual case -- copy w/o bad bytes
  517. const char* src = isrc;
  518. const char* srclimit = isrc + len;
  519. char* dst = idst;
  520. memmove(dst, src, n); // Copy initial good chunk
  521. src += n;
  522. dst += n;
  523. while (src < srclimit) { // src points to bogus byte or is off the end
  524. dst[0] = replace_char; // replace one bad byte
  525. src++;
  526. dst++;
  527. StringPiece str2(src, srclimit - src);
  528. n = UTF8SpnStructurallyValid(str2); // scan the remainder
  529. memmove(dst, src, n); // copy next good chunk
  530. src += n;
  531. dst += n;
  532. }
  533. }
  534. return idst;
  535. }
  536. } // namespace internal
  537. } // namespace protobuf
  538. } // namespace google