You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
144 lines
5.4 KiB
144 lines
5.4 KiB
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
|
|
#define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
|
|
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#include "annotator/feature-processor.h"
|
|
#include "annotator/model_generated.h"
|
|
#include "annotator/types.h"
|
|
#include "utils/utf8/unicodetext.h"
|
|
#include "utils/utf8/unilib.h"
|
|
|
|
namespace libtextclassifier3 {
|
|
|
|
namespace internal {
|
|
enum class DurationUnit {
|
|
UNKNOWN = -1,
|
|
WEEK = 0,
|
|
DAY = 1,
|
|
HOUR = 2,
|
|
MINUTE = 3,
|
|
SECOND = 4
|
|
|
|
// NOTE: If we want to add MONTH and YEAR we'll have to think of different
|
|
// parsing format, because MONTH and YEAR don't have a fixed number of
|
|
// milliseconds, unlike week/day/hour/minute/second. We ignore the daylight
|
|
// savings time and assume the day is always 24 hours.
|
|
};
|
|
|
|
// Prepares the mapping between token values and duration unit types.
|
|
std::unordered_map<std::string, internal::DurationUnit>
|
|
BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options,
|
|
const UniLib* unilib);
|
|
|
|
// Creates a set of strings from a flatbuffer string vector.
|
|
std::unordered_set<std::string> BuildStringSet(
|
|
const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
|
|
strings,
|
|
const UniLib* unilib);
|
|
|
|
// Creates a set of ints from a flatbuffer int vector.
|
|
std::unordered_set<int32> BuildInt32Set(const flatbuffers::Vector<int32>* ints);
|
|
|
|
} // namespace internal
|
|
|
|
// Annotator of duration expressions like "3 minutes 30 seconds".
|
|
class DurationAnnotator {
|
|
public:
|
|
explicit DurationAnnotator(const DurationAnnotatorOptions* options,
|
|
const FeatureProcessor* feature_processor,
|
|
const UniLib* unilib)
|
|
: options_(options),
|
|
feature_processor_(feature_processor),
|
|
unilib_(unilib),
|
|
token_value_to_duration_unit_(
|
|
internal::BuildTokenToDurationUnitMapping(options, unilib)),
|
|
filler_expressions_(
|
|
internal::BuildStringSet(options->filler_expressions(), unilib)),
|
|
half_expressions_(
|
|
internal::BuildStringSet(options->half_expressions(), unilib)),
|
|
sub_token_separator_codepoints_(internal::BuildInt32Set(
|
|
options->sub_token_separator_codepoints())) {}
|
|
|
|
// Classifies given text, and if it is a duration, it passes the result in
|
|
// 'classification_result' and returns true, otherwise returns false.
|
|
bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
|
|
AnnotationUsecase annotation_usecase,
|
|
ClassificationResult* classification_result) const;
|
|
|
|
// Finds all duration instances in the input text.
|
|
bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens,
|
|
AnnotationUsecase annotation_usecase,
|
|
std::vector<AnnotatedSpan>* results) const;
|
|
|
|
private:
|
|
// Represents a component of duration parsed from text (e.g. "3 hours" from
|
|
// the expression "3 hours and 20 minutes").
|
|
struct ParsedDurationAtom {
|
|
// Unit of the duration.
|
|
internal::DurationUnit unit = internal::DurationUnit::UNKNOWN;
|
|
|
|
// Quantity of the duration unit.
|
|
double value = 0;
|
|
|
|
// True, if half an unit was specified (either in addition, or exclusively).
|
|
// E.g. "hour and a half".
|
|
// NOTE: Quarter, three-quarters etc. is not supported.
|
|
bool plus_half = false;
|
|
|
|
static ParsedDurationAtom Half() {
|
|
ParsedDurationAtom result;
|
|
result.plus_half = true;
|
|
return result;
|
|
}
|
|
};
|
|
|
|
// Starts consuming tokens and returns the index past the last consumed token.
|
|
int FindDurationStartingAt(const UnicodeText& context,
|
|
const std::vector<Token>& tokens,
|
|
int start_token_index,
|
|
AnnotatedSpan* result) const;
|
|
|
|
bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const;
|
|
bool ParseDurationUnitToken(const Token& token,
|
|
internal::DurationUnit* duration_unit) const;
|
|
bool ParseQuantityDurationUnitToken(const Token& token,
|
|
ParsedDurationAtom* value) const;
|
|
bool ParseFillerToken(const Token& token) const;
|
|
|
|
int64 ParsedDurationAtomsToMillis(
|
|
const std::vector<ParsedDurationAtom>& atoms) const;
|
|
|
|
const DurationAnnotatorOptions* options_;
|
|
const FeatureProcessor* feature_processor_;
|
|
const UniLib* unilib_;
|
|
const std::unordered_map<std::string, internal::DurationUnit>
|
|
token_value_to_duration_unit_;
|
|
const std::unordered_set<std::string> filler_expressions_;
|
|
const std::unordered_set<std::string> half_expressions_;
|
|
const std::unordered_set<int32> sub_token_separator_codepoints_;
|
|
};
|
|
|
|
} // namespace libtextclassifier3
|
|
|
|
#endif // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
|