Zhuji Mahjong: add Redis
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <google/protobuf/io/tokenizer.h>

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4}
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.  (A TEST_2D sketch appears after the macro
// definitions below.)

#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {           \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                    \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                  \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)
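
// For reference, a minimal TEST_2D sketch; the fixture and case arrays here
// are illustrative only and are not defined in this file.  The test body runs
// once per (CASES1[i], CASES2[j]) pair:
//
// int kSizes[] = {1, 2};
// const char* kNames[] = {"a", "b"};
// TEST_2D(MyFixture, MyTest, kSizes, kNames) {
//   // Runs four times: (1,"a"), (1,"b"), (2,"a"), (2,"b").
//   EXPECT_GT(kSizes_case, 0);
// }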
// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) override {
    // Return an empty buffer on every call whose index is divisible by 3 or
    // 5 (i.e. calls 0, 3, 5, 6, 9, 10, 12, ...); otherwise delegate to the
    // underlying ArrayInputStream.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) override { return array_stream_.BackUp(count); }
  bool Skip(int count) override { return array_stream_.Skip(count); }
  int64_t ByteCount() const override { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  std::string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const std::string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n", line, column, message);
  }
};
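
// For example, an error reported at line 0, column 2 with the message
// "Invalid escape sequence in string literal." is recorded as the text
// "0:2: Invalid escape sequence in string literal.\n" -- the same format
// used by the expected strings in kErrorCases below.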
// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const std::string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};
// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;
  Tokenizer::TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", Tokenizer::TYPE_IDENTIFIER},

    // Test integers.
    {"123", Tokenizer::TYPE_INTEGER},
    {"0xab6", Tokenizer::TYPE_INTEGER},
    {"0XAB6", Tokenizer::TYPE_INTEGER},
    {"0X1234567", Tokenizer::TYPE_INTEGER},
    {"0x89abcdef", Tokenizer::TYPE_INTEGER},
    {"0x89ABCDEF", Tokenizer::TYPE_INTEGER},
    {"01234567", Tokenizer::TYPE_INTEGER},

    // Test floats.
    {"123.45", Tokenizer::TYPE_FLOAT},
    {"1.", Tokenizer::TYPE_FLOAT},
    {"1e3", Tokenizer::TYPE_FLOAT},
    {"1E3", Tokenizer::TYPE_FLOAT},
    {"1e-3", Tokenizer::TYPE_FLOAT},
    {"1e+3", Tokenizer::TYPE_FLOAT},
    {"1.e3", Tokenizer::TYPE_FLOAT},
    {"1.2e3", Tokenizer::TYPE_FLOAT},
    {".1", Tokenizer::TYPE_FLOAT},
    {".1e3", Tokenizer::TYPE_FLOAT},
    {".1e-3", Tokenizer::TYPE_FLOAT},
    {".1e+3", Tokenizer::TYPE_FLOAT},

    // Test strings.
    {"'hello'", Tokenizer::TYPE_STRING},
    {"\"foo\"", Tokenizer::TYPE_STRING},
    {"'a\"b'", Tokenizer::TYPE_STRING},
    {"\"a'b\"", Tokenizer::TYPE_STRING},
    {"'a\\'b'", Tokenizer::TYPE_STRING},
    {"\"a\\\"b\"", Tokenizer::TYPE_STRING},
    {"'\\xf'", Tokenizer::TYPE_STRING},
    {"'\\0'", Tokenizer::TYPE_STRING},

    // Test symbols.
    {"+", Tokenizer::TYPE_SYMBOL},
    {".", Tokenizer::TYPE_SYMBOL},
};
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif
// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {Tokenizer::TYPE_END, "", 0, 0, 0},
     }},

    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
         {Tokenizer::TYPE_FLOAT, "1.2", 0, 6, 9},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 10, 11},
         {Tokenizer::TYPE_STRING, "'bar'", 0, 12, 17},
         {Tokenizer::TYPE_END, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {Tokenizer::TYPE_SYMBOL, "!", 0, 0, 1},
         {Tokenizer::TYPE_SYMBOL, "@", 0, 1, 2},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 2, 3},
         {Tokenizer::TYPE_SYMBOL, "%", 0, 3, 4},
         {Tokenizer::TYPE_END, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7},
         {Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7},
         {Tokenizer::TYPE_END, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.
    {"foo\tbar \tbaz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19},
         {Tokenizer::TYPE_END, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {Tokenizer::TYPE_STRING, "\"foo\tbar\"", 0, 0, 12},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 13, 16},
         {Tokenizer::TYPE_END, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37},
         {Tokenizer::TYPE_END, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_SYMBOL, "#", 0, 4, 5},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 3, 3},
     }},

    // Test all whitespace chars.
    {"foo\n\t\r\v\fbar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14},
         {Tokenizer::TYPE_END, "", 1, 14, 14},
     }},
};
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);
  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/",      "qux", "corge", "/",
                                 "*",   "grault", "*", "/",   "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif
// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev /* ignored */ next",

     "",
     {},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    // Bound the index by the size of the expected-comments array itself, not
    // by the number of test cases.
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}
// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE(Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}
TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25, Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3, Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5, Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseString) {
  std::string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte (UTF-8) Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseString("", &output),
      "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  std::string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}
// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
    {"'\\X' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
    {"'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
    {"'foo", false, "0:4: Unexpected end of string.\n"},
    {"'bar\nfoo", true, "0:4: String literals cannot cross line boundaries.\n"},
    {"'\\u01' foo", true,
     "0:5: Expected four hex digits for \\u escape sequence.\n"},
    {"'\\u01' foo", true,
     "0:5: Expected four hex digits for \\u escape sequence.\n"},
    {"'\\uXYZ' foo", true,
     "0:3: Expected four hex digits for \\u escape sequence.\n"},

    // Integer errors.
    {"123foo", true, "0:3: Need space between number and identifier.\n"},

    // Hex/octal errors.
    {"0x foo", true, "0:2: \"0x\" must be followed by hex digits.\n"},
    {"0541823 foo", true,
     "0:4: Numbers starting with leading zero must be in octal.\n"},
    {"0x123z foo", true, "0:5: Need space between number and identifier.\n"},
    {"0x123.4 foo", true, "0:5: Hex and octal numbers must be integers.\n"},
    {"0123.4 foo", true, "0:4: Hex and octal numbers must be integers.\n"},

    // Float errors.
    {"1e foo", true, "0:2: \"e\" must be followed by exponent.\n"},
    {"1e- foo", true, "0:3: \"e\" must be followed by exponent.\n"},
    {"1.2.3 foo", true,
     "0:3: Already saw decimal point or exponent; can't have another one.\n"},
    {"1e2.3 foo", true,
     "0:3: Already saw decimal point or exponent; can't have another one.\n"},
    {"a.1 foo", true,
     "0:1: Need space between identifier and decimal point.\n"},
    // allow_f_after_float not enabled, so this should be an error.
    {"1.0f foo", true, "0:3: Need space between number and identifier.\n"},

    // Block comment errors.
    {"/*", false,
     "0:2: End-of-file inside block comment.\n"
     "0:0: Comment started here.\n"},
    {"/*/*/ foo", true,
     "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},

    // Control characters.  Multiple consecutive control characters should only
    // produce one error.
    {"\b foo", true, "0:0: Invalid control characters encountered in text.\n"},
    {"\b\b foo", true,
     "0:0: Invalid control characters encountered in text.\n"},

    // Check that control characters at end of input don't result in an
    // infinite loop.
    {"\b", false, "0:0: Invalid control characters encountered in text.\n"},

    // Check recovery from '\0'.  We have to explicitly specify the length of
    // these strings because otherwise the string constructor will just call
    // strlen() which will see the first '\0' and think that is the end of the
    // string.
    {std::string("\0foo", 4), true,
     "0:0: Invalid control characters encountered in text.\n"},
    {std::string("\0\0foo", 5), true,
     "0:0: Invalid control characters encountered in text.\n"},

    // Check error from high order bits set.
    {"\300foo", true, "0:0: Interpreting non ascii codepoint 192.\n"},
};
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}
// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  std::string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google