You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
173 lines
5.4 KiB
173 lines
5.4 KiB
4 months ago
|
// Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
#include "bsdiff/suffix_array_index.h"
|
||
|
|
||
|
#include <limits>
|
||
|
#include <vector>
|
||
|
|
||
|
#include <divsufsort.h>
|
||
|
#include <divsufsort64.h>
|
||
|
|
||
|
#include "bsdiff/logging.h"
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
// libdivsufsort C++ overloaded functions used to allow calling the right
|
||
|
// implementation based on the pointer size.
|
||
|
int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
|
||
|
return divsufsort(text, sa, n);
|
||
|
}
|
||
|
int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
|
||
|
return divsufsort64(text, sa, n);
|
||
|
}
|
||
|
|
||
|
saidx_t CallSaSearch(const uint8_t* text,
|
||
|
size_t text_size,
|
||
|
const uint8_t* pattern,
|
||
|
size_t pattern_size,
|
||
|
const saidx_t* sa,
|
||
|
size_t sa_size,
|
||
|
saidx_t* left) {
|
||
|
return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
|
||
|
}
|
||
|
saidx64_t CallSaSearch(const uint8_t* text,
|
||
|
size_t text_size,
|
||
|
const uint8_t* pattern,
|
||
|
size_t pattern_size,
|
||
|
const saidx64_t* sa,
|
||
|
size_t sa_size,
|
||
|
saidx64_t* left) {
|
||
|
return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
|
||
|
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
namespace bsdiff {
|
||
|
|
||
|
// The SAIDX template type must be either saidx_t or saidx64_t, which will
|
||
|
// depend on the maximum size of the input text needed.
|
||
|
template <typename SAIDX>
|
||
|
class SuffixArrayIndex : public SuffixArrayIndexInterface {
|
||
|
public:
|
||
|
SuffixArrayIndex() = default;
|
||
|
|
||
|
// Initialize and construct the suffix array of the |text| string of length
|
||
|
// |n|. The memory pointed by |text| must be kept alive. Returns whether the
|
||
|
// construction succeeded.
|
||
|
bool Init(const uint8_t* text, size_t n);
|
||
|
|
||
|
// SuffixArrayIndexInterface overrides.
|
||
|
void SearchPrefix(const uint8_t* target,
|
||
|
size_t length,
|
||
|
size_t* out_length,
|
||
|
uint64_t* out_pos) const override;
|
||
|
|
||
|
private:
|
||
|
const uint8_t* text_{nullptr}; // Owned by the caller.
|
||
|
size_t n_{0};
|
||
|
|
||
|
std::vector<SAIDX> sa_;
|
||
|
};
|
||
|
|
||
|
template <typename SAIDX>
|
||
|
bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
|
||
|
if (!sa_.empty()) {
|
||
|
// Already initialized.
|
||
|
LOG(ERROR) << "SuffixArray already initialized";
|
||
|
return false;
|
||
|
}
|
||
|
if (static_cast<uint64_t>(n) >
|
||
|
static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
|
||
|
LOG(ERROR) << "Input too big (" << n << ") for this implementation";
|
||
|
return false;
|
||
|
}
|
||
|
text_ = text;
|
||
|
n_ = n;
|
||
|
sa_.resize(n + 1);
|
||
|
|
||
|
if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
|
||
|
LOG(ERROR) << "divsufsrot() failed";
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
template <typename SAIDX>
|
||
|
void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
|
||
|
size_t length,
|
||
|
size_t* out_length,
|
||
|
uint64_t* out_pos) const {
|
||
|
SAIDX suf_left;
|
||
|
SAIDX count =
|
||
|
CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
|
||
|
if (count > 0) {
|
||
|
// This is the simple case where we found the whole |target| string was
|
||
|
// found.
|
||
|
*out_pos = sa_[suf_left];
|
||
|
*out_length = length;
|
||
|
return;
|
||
|
}
|
||
|
// In this case, |suf_left| points to the first suffix array position such
|
||
|
// that the suffix at that position is lexicographically larger than |target|.
|
||
|
// We only need to check whether the previous entry or the current entry is a
|
||
|
// longer match.
|
||
|
size_t prev_suffix_len = 0;
|
||
|
if (suf_left > 0) {
|
||
|
const size_t prev_max_len =
|
||
|
std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
|
||
|
const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
|
||
|
prev_suffix_len =
|
||
|
std::mismatch(target, target + prev_max_len, prev_suffix).first -
|
||
|
target;
|
||
|
}
|
||
|
|
||
|
size_t next_suffix_len = 0;
|
||
|
if (static_cast<size_t>(suf_left) < n_) {
|
||
|
const uint8_t* next_suffix = text_ + sa_[suf_left];
|
||
|
const size_t next_max_len =
|
||
|
std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
|
||
|
next_suffix_len =
|
||
|
std::mismatch(target, target + next_max_len, next_suffix).first -
|
||
|
target;
|
||
|
}
|
||
|
|
||
|
*out_length = std::max(next_suffix_len, prev_suffix_len);
|
||
|
if (!*out_length) {
|
||
|
*out_pos = 0;
|
||
|
} else if (next_suffix_len >= prev_suffix_len) {
|
||
|
*out_pos = sa_[suf_left];
|
||
|
} else {
|
||
|
*out_pos = sa_[suf_left - 1];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
|
||
|
const uint8_t* text,
|
||
|
size_t n) {
|
||
|
// The maximum supported size when using the suffix array based on the 32-bit
|
||
|
// saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
|
||
|
// than the maximum representable number so references like "n + 1" are don't
|
||
|
// overflow.
|
||
|
const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
|
||
|
std::unique_ptr<SuffixArrayIndexInterface> ret;
|
||
|
|
||
|
if (n > kMaxSaidxSize) {
|
||
|
SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
|
||
|
ret.reset(sa_ptr);
|
||
|
if (!sa_ptr->Init(text, n))
|
||
|
return nullptr;
|
||
|
} else {
|
||
|
SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
|
||
|
ret.reset(sa_ptr);
|
||
|
if (!sa_ptr->Init(text, n))
|
||
|
return nullptr;
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
|
||
|
} // namespace bsdiff
|