/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utils/utf8/unilib_test-include.h" #include "utils/base/logging.h" #include "gmock/gmock.h" namespace libtextclassifier3 { namespace test_internal { using ::testing::ElementsAre; TEST_F(UniLibTest, CharacterClassesAscii) { EXPECT_TRUE(unilib_->IsOpeningBracket('(')); EXPECT_TRUE(unilib_->IsClosingBracket(')')); EXPECT_FALSE(unilib_->IsWhitespace(')')); EXPECT_TRUE(unilib_->IsWhitespace(' ')); EXPECT_FALSE(unilib_->IsDigit(')')); EXPECT_TRUE(unilib_->IsDigit('0')); EXPECT_TRUE(unilib_->IsDigit('9')); EXPECT_FALSE(unilib_->IsUpper(')')); EXPECT_TRUE(unilib_->IsUpper('A')); EXPECT_TRUE(unilib_->IsUpper('Z')); EXPECT_FALSE(unilib_->IsLower(')')); EXPECT_TRUE(unilib_->IsLower('a')); EXPECT_TRUE(unilib_->IsLower('z')); EXPECT_TRUE(unilib_->IsPunctuation('!')); EXPECT_TRUE(unilib_->IsPunctuation('?')); EXPECT_TRUE(unilib_->IsPunctuation('#')); EXPECT_TRUE(unilib_->IsPunctuation('(')); EXPECT_FALSE(unilib_->IsPunctuation('0')); EXPECT_FALSE(unilib_->IsPunctuation('$')); EXPECT_TRUE(unilib_->IsPercentage('%')); EXPECT_TRUE(unilib_->IsPercentage(u'%')); EXPECT_TRUE(unilib_->IsSlash('/')); EXPECT_TRUE(unilib_->IsSlash(u'/')); EXPECT_TRUE(unilib_->IsMinus('-')); EXPECT_TRUE(unilib_->IsMinus(u'-')); EXPECT_TRUE(unilib_->IsNumberSign('#')); EXPECT_TRUE(unilib_->IsNumberSign(u'#')); EXPECT_TRUE(unilib_->IsDot('.')); EXPECT_TRUE(unilib_->IsDot(u'.')); EXPECT_TRUE(unilib_->IsApostrophe('\'')); EXPECT_TRUE(unilib_->IsApostrophe(u'ߴ')); EXPECT_TRUE(unilib_->IsQuotation(u'"')); EXPECT_TRUE(unilib_->IsQuotation(u'”')); EXPECT_TRUE(unilib_->IsAmpersand(u'&')); EXPECT_TRUE(unilib_->IsAmpersand(u'﹠')); EXPECT_TRUE(unilib_->IsAmpersand(u'&')); EXPECT_TRUE(unilib_->IsLatinLetter('A')); EXPECT_TRUE(unilib_->IsArabicLetter(u'ب')); // ARABIC LETTER BEH EXPECT_TRUE( unilib_->IsCyrillicLetter(u'ᲀ')); // CYRILLIC SMALL LETTER ROUNDED VE EXPECT_TRUE(unilib_->IsChineseLetter(u'豈')); // CJK COMPATIBILITY IDEOGRAPH EXPECT_TRUE(unilib_->IsJapaneseLetter(u'ぁ')); // HIRAGANA LETTER SMALL A EXPECT_TRUE(unilib_->IsKoreanLetter(u'ㄱ')); // HANGUL LETTER KIYEOK EXPECT_TRUE(unilib_->IsThaiLetter(u'ก')); // THAI CHARACTER KO KAI EXPECT_TRUE(unilib_->IsCJTletter(u'ก')); // THAI CHARACTER KO KAI EXPECT_FALSE(unilib_->IsCJTletter('A')); EXPECT_TRUE(unilib_->IsLetter('A')); EXPECT_TRUE(unilib_->IsLetter(u'A')); EXPECT_TRUE(unilib_->IsLetter(u'ト')); // KATAKANA LETTER TO EXPECT_TRUE(unilib_->IsLetter(u'ト')); // HALFWIDTH KATAKANA LETTER TO EXPECT_TRUE(unilib_->IsLetter(u'豈')); // CJK COMPATIBILITY IDEOGRAPH EXPECT_EQ(unilib_->ToLower('A'), 'a'); EXPECT_EQ(unilib_->ToLower('Z'), 'z'); EXPECT_EQ(unilib_->ToLower(')'), ')'); EXPECT_EQ(unilib_->ToLowerText(UTF8ToUnicodeText("Never gonna give you up.")) .ToUTF8String(), "never gonna give you up."); EXPECT_EQ(unilib_->ToUpper('a'), 'A'); EXPECT_EQ(unilib_->ToUpper('z'), 'Z'); EXPECT_EQ(unilib_->ToUpper(')'), ')'); EXPECT_EQ(unilib_->ToUpperText(UTF8ToUnicodeText("Never gonna let you down.")) .ToUTF8String(), "NEVER GONNA LET YOU DOWN."); EXPECT_EQ(unilib_->GetPairedBracket(')'), '('); EXPECT_EQ(unilib_->GetPairedBracket('}'), '{'); } TEST_F(UniLibTest, CharacterClassesUnicode) { EXPECT_TRUE(unilib_->IsOpeningBracket(0x0F3C)); // TIBET ANG KHANG GYON EXPECT_TRUE(unilib_->IsClosingBracket(0x0F3D)); // TIBET ANG KHANG GYAS EXPECT_FALSE(unilib_->IsWhitespace(0x23F0)); // ALARM CLOCK EXPECT_TRUE(unilib_->IsWhitespace(0x2003)); // EM SPACE EXPECT_FALSE(unilib_->IsDigit(0xA619)); // VAI SYMBOL JONG EXPECT_TRUE(unilib_->IsDigit(0xA620)); // VAI DIGIT ZERO EXPECT_TRUE(unilib_->IsDigit(0xA629)); // VAI DIGIT NINE EXPECT_FALSE(unilib_->IsDigit(0xA62A)); // VAI SYLLABLE NDOLE MA EXPECT_FALSE(unilib_->IsUpper(0x0211)); // SMALL R WITH DOUBLE GRAVE EXPECT_TRUE(unilib_->IsUpper(0x0212)); // CAPITAL R WITH DOUBLE GRAVE EXPECT_TRUE(unilib_->IsUpper(0x0391)); // GREEK CAPITAL ALPHA EXPECT_TRUE(unilib_->IsUpper(0x03AB)); // GREEK CAPITAL UPSILON W DIAL EXPECT_FALSE(unilib_->IsUpper(0x03AC)); // GREEK SMALL ALPHA WITH TONOS EXPECT_TRUE(unilib_->IsLower(0x03AC)); // GREEK SMALL ALPHA WITH TONOS EXPECT_TRUE(unilib_->IsLower(0x03B1)); // GREEK SMALL ALPHA EXPECT_TRUE(unilib_->IsLower(0x03CB)); // GREEK SMALL UPSILON EXPECT_TRUE(unilib_->IsLower(0x0211)); // SMALL R WITH DOUBLE GRAVE EXPECT_TRUE(unilib_->IsLower(0x03C0)); // GREEK SMALL PI EXPECT_TRUE(unilib_->IsLower(0x007A)); // SMALL Z EXPECT_FALSE(unilib_->IsLower(0x005A)); // CAPITAL Z EXPECT_FALSE(unilib_->IsLower(0x0212)); // CAPITAL R WITH DOUBLE GRAVE EXPECT_FALSE(unilib_->IsLower(0x0391)); // GREEK CAPITAL ALPHA EXPECT_TRUE(unilib_->IsPunctuation(0x055E)); // ARMENIAN QUESTION MARK EXPECT_TRUE(unilib_->IsPunctuation(0x066C)); // ARABIC THOUSANDS SEPARATOR EXPECT_TRUE(unilib_->IsPunctuation(0x07F7)); // NKO SYMBOL GBAKURUNEN EXPECT_TRUE(unilib_->IsPunctuation(0x10AF2)); // DOUBLE DOT WITHIN DOT EXPECT_FALSE(unilib_->IsPunctuation(0x00A3)); // POUND SIGN EXPECT_FALSE(unilib_->IsPunctuation(0xA838)); // NORTH INDIC RUPEE MARK EXPECT_TRUE(unilib_->IsPercentage(0x0025)); // PERCENT SIGN EXPECT_TRUE(unilib_->IsPercentage(0xFF05)); // FULLWIDTH PERCENT SIGN EXPECT_TRUE(unilib_->IsSlash(0x002F)); // SOLIDUS EXPECT_TRUE(unilib_->IsSlash(0xFF0F)); // FULLWIDTH SOLIDUS EXPECT_TRUE(unilib_->IsMinus(0x002D)); // HYPHEN-MINUS EXPECT_TRUE(unilib_->IsMinus(0xFF0D)); // FULLWIDTH HYPHEN-MINUS EXPECT_TRUE(unilib_->IsNumberSign(0x0023)); // NUMBER SIGN EXPECT_TRUE(unilib_->IsNumberSign(0xFF03)); // FULLWIDTH NUMBER SIGN EXPECT_TRUE(unilib_->IsDot(0x002E)); // FULL STOP EXPECT_TRUE(unilib_->IsDot(0xFF0E)); // FULLWIDTH FULL STOP EXPECT_TRUE(unilib_->IsLatinLetter(0x0041)); // LATIN CAPITAL LETTER A EXPECT_TRUE(unilib_->IsArabicLetter(0x0628)); // ARABIC LETTER BEH EXPECT_TRUE( unilib_->IsCyrillicLetter(0x1C80)); // CYRILLIC SMALL LETTER ROUNDED VE EXPECT_TRUE(unilib_->IsChineseLetter(0xF900)); // CJK COMPATIBILITY IDEOGRAPH EXPECT_TRUE(unilib_->IsJapaneseLetter(0x3041)); // HIRAGANA LETTER SMALL A EXPECT_TRUE(unilib_->IsKoreanLetter(0x3131)); // HANGUL LETTER KIYEOK EXPECT_TRUE(unilib_->IsThaiLetter(0x0E01)); // THAI CHARACTER KO KAI EXPECT_TRUE(unilib_->IsCJTletter(0x0E01)); // THAI CHARACTER KO KAI EXPECT_FALSE(unilib_->IsCJTletter(0x0041)); // LATIN CAPITAL LETTER A EXPECT_TRUE(unilib_->IsLetter(0x0041)); // LATIN CAPITAL LETTER A EXPECT_TRUE(unilib_->IsLetter(0xFF21)); // FULLWIDTH LATIN CAPITAL LETTER A EXPECT_TRUE(unilib_->IsLetter(0x30C8)); // KATAKANA LETTER TO EXPECT_TRUE(unilib_->IsLetter(0xFF84)); // HALFWIDTH KATAKANA LETTER TO EXPECT_TRUE(unilib_->IsLetter(0xF900)); // CJK COMPATIBILITY IDEOGRAPH EXPECT_EQ(unilib_->ToLower(0x0391), 0x03B1); // GREEK ALPHA EXPECT_EQ(unilib_->ToLower(0x03AB), 0x03CB); // GREEK UPSILON WITH DIALYTIKA EXPECT_EQ(unilib_->ToLower(0x03C0), 0x03C0); // GREEK SMALL PI EXPECT_EQ(unilib_->ToLower(0x03A3), 0x03C3); // GREEK CAPITAL LETTER SIGMA EXPECT_EQ( unilib_->ToLowerText(UTF8ToUnicodeText("Κανένας άνθρωπος δεν ξέρει")) .ToUTF8String(), "κανένας άνθρωπος δεν ξέρει"); EXPECT_TRUE(unilib_->IsLowerText(UTF8ToUnicodeText("ξέρει"))); EXPECT_EQ(unilib_->ToUpper(0x03B1), 0x0391); // GREEK ALPHA EXPECT_EQ(unilib_->ToUpper(0x03CB), 0x03AB); // GREEK UPSILON WITH DIALYTIKA EXPECT_EQ(unilib_->ToUpper(0x0391), 0x0391); // GREEK CAPITAL ALPHA EXPECT_EQ(unilib_->ToUpper(0x03C3), 0x03A3); // GREEK CAPITAL LETTER SIGMA EXPECT_EQ(unilib_->ToUpper(0x03C2), 0x03A3); // GREEK CAPITAL LETTER SIGMA EXPECT_EQ( unilib_->ToUpperText(UTF8ToUnicodeText("Κανένας άνθρωπος δεν ξέρει")) .ToUTF8String(), "ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ"); EXPECT_TRUE(unilib_->IsUpperText(UTF8ToUnicodeText("ΚΑΝΈΝΑΣ"))); EXPECT_EQ(unilib_->GetPairedBracket(0x0F3C), 0x0F3D); EXPECT_EQ(unilib_->GetPairedBracket(0x0F3D), 0x0F3C); } TEST_F(UniLibTest, RegexInterface) { const UnicodeText regex_pattern = UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true); std::unique_ptr pattern = unilib_->CreateRegexPattern(regex_pattern); const UnicodeText input = UTF8ToUnicodeText("hello 0123", /*do_copy=*/false); int status; std::unique_ptr matcher = pattern->Matcher(input); TC3_LOG(INFO) << matcher->Matches(&status); TC3_LOG(INFO) << matcher->Find(&status); TC3_LOG(INFO) << matcher->Start(0, &status); TC3_LOG(INFO) << matcher->End(0, &status); TC3_LOG(INFO) << matcher->Group(0, &status).size_codepoints(); } TEST_F(UniLibTest, Regex) { // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to // test the regex functionality with it to verify we are handling the indices // correctly. const UnicodeText regex_pattern = UTF8ToUnicodeText("[0-9]+😋", /*do_copy=*/false); std::unique_ptr pattern = unilib_->CreateRegexPattern(regex_pattern); int status; std::unique_ptr matcher; matcher = pattern->Matcher(UTF8ToUnicodeText("0123😋", /*do_copy=*/false)); EXPECT_TRUE(matcher->Matches(&status)); EXPECT_TRUE(matcher->ApproximatelyMatches(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_TRUE(matcher->Matches(&status)); // Check that the state is reset. EXPECT_TRUE(matcher->ApproximatelyMatches(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); matcher = pattern->Matcher( UTF8ToUnicodeText("hello😋😋 0123😋 world", /*do_copy=*/false)); EXPECT_FALSE(matcher->Matches(&status)); EXPECT_FALSE(matcher->ApproximatelyMatches(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); matcher = pattern->Matcher( UTF8ToUnicodeText("hello😋😋 0123😋 world", /*do_copy=*/false)); EXPECT_TRUE(matcher->Find(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Start(0, &status), 8); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->End(0, &status), 13); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123😋"); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); } TEST_F(UniLibTest, RegexLazy) { std::unique_ptr pattern = unilib_->CreateLazyRegexPattern( UTF8ToUnicodeText("[a-z][0-9]", /*do_copy=*/false)); int status; std::unique_ptr matcher; matcher = pattern->Matcher(UTF8ToUnicodeText("a3", /*do_copy=*/false)); EXPECT_TRUE(matcher->Matches(&status)); EXPECT_TRUE(matcher->ApproximatelyMatches(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_TRUE(matcher->Matches(&status)); // Check that the state is reset. EXPECT_TRUE(matcher->ApproximatelyMatches(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); matcher = pattern->Matcher(UTF8ToUnicodeText("3a", /*do_copy=*/false)); EXPECT_FALSE(matcher->Matches(&status)); EXPECT_FALSE(matcher->ApproximatelyMatches(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); } TEST_F(UniLibTest, RegexGroups) { // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to // test the regex functionality with it to verify we are handling the indices // correctly. const UnicodeText regex_pattern = UTF8ToUnicodeText("([0-9])([0-9]+)😋", /*do_copy=*/false); std::unique_ptr pattern = unilib_->CreateRegexPattern(regex_pattern); int status; std::unique_ptr matcher; matcher = pattern->Matcher( UTF8ToUnicodeText("hello😋😋 0123😋 world", /*do_copy=*/false)); EXPECT_TRUE(matcher->Find(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Start(0, &status), 8); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Start(1, &status), 8); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Start(2, &status), 9); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->End(0, &status), 13); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->End(1, &status), 9); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->End(2, &status), 12); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123😋"); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0"); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123"); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); } TEST_F(UniLibTest, RegexGroupsNotAllGroupsInvolved) { const UnicodeText regex_pattern = UTF8ToUnicodeText("([0-9])([a-z])?", /*do_copy=*/false); std::unique_ptr pattern = unilib_->CreateRegexPattern(regex_pattern); int status; std::unique_ptr matcher; matcher = pattern->Matcher(UTF8ToUnicodeText("7", /*do_copy=*/false)); EXPECT_TRUE(matcher->Find(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "7"); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "7"); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), ""); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); } TEST_F(UniLibTest, RegexGroupsEmptyResult) { const UnicodeText regex_pattern = UTF8ToUnicodeText("(.*)", /*do_copy=*/false); std::unique_ptr pattern = unilib_->CreateRegexPattern(regex_pattern); int status; std::unique_ptr matcher; matcher = pattern->Matcher(UTF8ToUnicodeText("", /*do_copy=*/false)); EXPECT_TRUE(matcher->Find(&status)); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), ""); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), ""); EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); } TEST_F(UniLibTest, BreakIterator) { const UnicodeText text = UTF8ToUnicodeText("some text", /*do_copy=*/false); std::unique_ptr iterator = unilib_->CreateBreakIterator(text); std::vector break_indices; int break_index = 0; while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) { break_indices.push_back(break_index); } EXPECT_THAT(break_indices, ElementsAre(4, 5, 9)); } TEST_F(UniLibTest, BreakIterator4ByteUTF8) { const UnicodeText text = UTF8ToUnicodeText("😀😂😋", /*do_copy=*/false); std::unique_ptr iterator = unilib_->CreateBreakIterator(text); std::vector break_indices; int break_index = 0; while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) { break_indices.push_back(break_index); } EXPECT_THAT(break_indices, ElementsAre(1, 2, 3)); } TEST_F(UniLibTest, Integer32Parse) { int result; EXPECT_TRUE(unilib_->ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false), &result)); EXPECT_EQ(result, 123); } TEST_F(UniLibTest, Integer32ParseFloatNumber) { int result; EXPECT_FALSE(unilib_->ParseInt32(UTF8ToUnicodeText("12.3", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer32ParseLongNumber) { int32 result; EXPECT_TRUE(unilib_->ParseInt32( UTF8ToUnicodeText("1000000000", /*do_copy=*/false), &result)); EXPECT_EQ(result, 1000000000); } TEST_F(UniLibTest, Integer32ParseOverflowNumber) { int32 result; EXPECT_FALSE(unilib_->ParseInt32( UTF8ToUnicodeText("9123456789", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer32ParseEmptyString) { int result; EXPECT_FALSE( unilib_->ParseInt32(UTF8ToUnicodeText("", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer32ParseFullWidth) { int result; // The input string here is full width EXPECT_TRUE(unilib_->ParseInt32( UTF8ToUnicodeText("123", /*do_copy=*/false), &result)); EXPECT_EQ(result, 123); } TEST_F(UniLibTest, Integer32ParseNotNumber) { int result; // The input string here is full width EXPECT_FALSE(unilib_->ParseInt32( UTF8ToUnicodeText("1a3", /*do_copy=*/false), &result)); // Strings starting with "nan" are not numbers. EXPECT_FALSE(unilib_->ParseInt32(UTF8ToUnicodeText("Nancy", /*do_copy=*/false), &result)); // Strings starting with "inf" are not numbers EXPECT_FALSE(unilib_->ParseInt32( UTF8ToUnicodeText("Information", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer64Parse) { int64 result; EXPECT_TRUE(unilib_->ParseInt64(UTF8ToUnicodeText("123", /*do_copy=*/false), &result)); EXPECT_EQ(result, 123); } TEST_F(UniLibTest, Integer64ParseFloatNumber) { int64 result; EXPECT_FALSE(unilib_->ParseInt64(UTF8ToUnicodeText("12.3", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer64ParseLongNumber) { int64 result; // The limitation comes from the javaicu implementation: parseDouble does not // have ICU support and parseInt limit the size of the number. EXPECT_TRUE(unilib_->ParseInt64( UTF8ToUnicodeText("1000000000", /*do_copy=*/false), &result)); EXPECT_EQ(result, 1000000000); } TEST_F(UniLibTest, Integer64ParseOverflowNumber) { int64 result; EXPECT_FALSE(unilib_->ParseInt64( UTF8ToUnicodeText("92233720368547758099", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer64ParseOverflowNegativeNumber) { int64 result; EXPECT_FALSE(unilib_->ParseInt64( UTF8ToUnicodeText("-92233720368547758099", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer64ParseEmptyString) { int64 result; EXPECT_FALSE( unilib_->ParseInt64(UTF8ToUnicodeText("", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Integer64ParseFullWidth) { int64 result; // The input string here is full width EXPECT_TRUE(unilib_->ParseInt64( UTF8ToUnicodeText("123", /*do_copy=*/false), &result)); EXPECT_EQ(result, 123); } TEST_F(UniLibTest, Integer64ParseNotNumber) { int64 result; // The input string here is full width EXPECT_FALSE(unilib_->ParseInt64( UTF8ToUnicodeText("1a4", /*do_copy=*/false), &result)); // Strings starting with "nan" are not numbers. EXPECT_FALSE(unilib_->ParseInt64(UTF8ToUnicodeText("Nancy", /*do_copy=*/false), &result)); // Strings starting with "inf" are not numbers EXPECT_FALSE(unilib_->ParseInt64( UTF8ToUnicodeText("Information", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, DoubleParse) { double result; EXPECT_TRUE(unilib_->ParseDouble(UTF8ToUnicodeText("1.23", /*do_copy=*/false), &result)); EXPECT_EQ(result, 1.23); } TEST_F(UniLibTest, DoubleParseLongNumber) { double result; // The limitation comes from the javaicu implementation: parseDouble does not // have ICU support and parseInt limit the size of the number. EXPECT_TRUE(unilib_->ParseDouble( UTF8ToUnicodeText("999999999.999999999", /*do_copy=*/false), &result)); EXPECT_EQ(result, 999999999.999999999); } TEST_F(UniLibTest, DoubleParseWithoutFractionalPart) { double result; EXPECT_TRUE(unilib_->ParseDouble(UTF8ToUnicodeText("123", /*do_copy=*/false), &result)); EXPECT_EQ(result, 123); } TEST_F(UniLibTest, DoubleParseEmptyString) { double result; EXPECT_FALSE( unilib_->ParseDouble(UTF8ToUnicodeText("", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, DoubleParsePrecedingDot) { double result; EXPECT_FALSE(unilib_->ParseDouble( UTF8ToUnicodeText(".123", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, DoubleParseLeadingDot) { double result; EXPECT_FALSE(unilib_->ParseDouble( UTF8ToUnicodeText("123.", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, DoubleParseMultipleDots) { double result; EXPECT_FALSE(unilib_->ParseDouble( UTF8ToUnicodeText("1.2.3", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, DoubleParseFullWidth) { double result; // The input string here is full width EXPECT_TRUE(unilib_->ParseDouble( UTF8ToUnicodeText("1.23", /*do_copy=*/false), &result)); EXPECT_EQ(result, 1.23); } TEST_F(UniLibTest, DoubleParseNotNumber) { double result; // The input string here is full width EXPECT_FALSE(unilib_->ParseDouble( UTF8ToUnicodeText("1a5", /*do_copy=*/false), &result)); // Strings starting with "nan" are not numbers. EXPECT_FALSE(unilib_->ParseDouble( UTF8ToUnicodeText("Nancy", /*do_copy=*/false), &result)); // Strings starting with "inf" are not numbers EXPECT_FALSE(unilib_->ParseDouble( UTF8ToUnicodeText("Information", /*do_copy=*/false), &result)); } TEST_F(UniLibTest, Length) { EXPECT_EQ(unilib_->Length(UTF8ToUnicodeText("hello", /*do_copy=*/false)) .ValueOrDie(), 5); EXPECT_EQ(unilib_->Length(UTF8ToUnicodeText("ěščřž", /*do_copy=*/false)) .ValueOrDie(), 5); // Test Invalid UTF8. // This testing condition needs to be != 1, as Apple character counting seems // to return 0 when the input is invalid UTF8, while ICU will treat the // invalid codepoint as 3 separate bytes. EXPECT_NE( unilib_->Length(UTF8ToUnicodeText("\xed\xa0\x80", /*do_copy=*/false)) .ValueOrDie(), 1); } } // namespace test_internal } // namespace libtextclassifier3