//===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "llvm/Support/SuffixTree.h" #include "gtest/gtest.h" #include using namespace llvm; namespace { // Each example vector has a unique element at the end to represent the end of // the string // Tests that The SuffixTree finds a simple repetition of the substring {1, 2} // {1, 2} twice in the provided string. TEST(SuffixTreeTest, TestSingleRepetition) { std::vector SimpleRepetitionData = {1, 2, 1, 2, 3}; SuffixTree ST(SimpleRepetitionData); std::vector SubStrings; for (auto It = ST.begin(); It != ST.end(); It++) SubStrings.push_back(*It); ASSERT_EQ(SubStrings.size(), 1u); EXPECT_EQ(SubStrings[0].Length, 2u); EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u); for (unsigned StartIdx : SubStrings[0].StartIndices) { EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u); EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u); } } // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4. // This test also serves as a flag for improvements to the suffix tree. // // FIXME: Right now, the longest repeated substring from a specific index is // returned, this could be improved to return the longest repeated substring, as // well as those that are smaller. TEST(SuffixTreeTest, TestLongerRepetition) { std::vector RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4}; SuffixTree ST(RepeatedRepetitionData); std::vector SubStrings; for (auto It = ST.begin(); It != ST.end(); It++) SubStrings.push_back(*It); EXPECT_EQ(SubStrings.size(), 2u); unsigned Len; for (SuffixTree::RepeatedSubstring &RS : SubStrings) { Len = RS.Length; bool IsExpectedLen = (Len == 3u || Len == 2u); bool IsExpectedIndex; ASSERT_TRUE(IsExpectedLen); if (Len == 3u) { for (unsigned StartIdx : RS.StartIndices) { IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u); EXPECT_TRUE(IsExpectedIndex); EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u); EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u); EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u); } } else { for (unsigned StartIdx : RS.StartIndices) { IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u); EXPECT_TRUE(IsExpectedIndex); EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u); EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u); } } } } // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at // indices 0 and 1. // // FIXME: Add support for detecting {1, 1} and {1, 1, 1} TEST(SuffixTreeTest, TestSingleCharacterRepeat) { std::vector RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2}; std::vector::iterator RRDIt, RRDIt2; SuffixTree ST(RepeatedRepetitionData); std::vector SubStrings; for (auto It = ST.begin(); It != ST.end(); It++) SubStrings.push_back(*It); EXPECT_EQ(SubStrings.size(), 1u); for (SuffixTree::RepeatedSubstring &RS : SubStrings) { EXPECT_EQ(RS.StartIndices.size(), RepeatedRepetitionData.size() - RS.Length); for (unsigned StartIdx : SubStrings[0].StartIndices) { RRDIt = RRDIt2 = RepeatedRepetitionData.begin(); std::advance(RRDIt, StartIdx); std::advance(RRDIt2, StartIdx + SubStrings[0].Length); ASSERT_TRUE( all_of(make_range::iterator>(RRDIt, RRDIt2), [](unsigned Elt) { return Elt == 1; })); } } } // The suffix tree cannot currently find repeated substrings in strings of the // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem // repeats") // // FIXME: Teach the SuffixTree to recognize these cases. TEST(SuffixTreeTest, TestTandemRepeat) { std::vector RepeatedRepetitionData = {1, 2, 3, 1, 2, 3}; SuffixTree ST(RepeatedRepetitionData); std::vector SubStrings; for (auto It = ST.begin(); It != ST.end(); It++) SubStrings.push_back(*It); EXPECT_EQ(SubStrings.size(), 0u); } // Tests that the SuffixTree does not erroneously include values that are not // in repeated substrings. That is, only finds {1, 1} at indices 0 and 3 and // does not include 2 and 3. TEST(SuffixTreeTest, TestExclusion) { std::vector RepeatedRepetitionData = {1, 1, 2, 1, 1, 3}; std::vector::iterator RRDIt, RRDIt2; SuffixTree ST(RepeatedRepetitionData); std::vector SubStrings; for (auto It = ST.begin(); It != ST.end(); It++) SubStrings.push_back(*It); EXPECT_EQ(SubStrings.size(), 1u); bool IsExpectedIndex; for (SuffixTree::RepeatedSubstring &RS : SubStrings) { for (unsigned StartIdx : RS.StartIndices) { IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u); EXPECT_TRUE(IsExpectedIndex); RRDIt = RRDIt2 = RepeatedRepetitionData.begin(); std::advance(RRDIt, StartIdx); std::advance(RRDIt2, StartIdx + RS.Length); EXPECT_TRUE( all_of(make_range::iterator>(RRDIt, RRDIt2), [](unsigned Elt) { return Elt == 1; })); } } } } // namespace