You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

124 lines
4.9 KiB

/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
#define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
#include <string>
#include <unordered_map>
#include <vector>
#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/strings/stringpiece.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
namespace libtextclassifier3 {
struct CompiledRule {
// The compiled regular expression.
std::unique_ptr<const UniLib::RegexPattern> compiled_regex;
// The uncompiled pattern and information about the pattern groups.
const DatetimeModelPattern_::Regex* regex;
// DatetimeModelPattern which 'regex' is part of and comes from.
const DatetimeModelPattern* pattern;
};
// A helper class for DatetimeParser that extracts structured data
// (DateParseDate) from the current match of the passed RegexMatcher.
class DatetimeExtractor {
public:
explicit DatetimeExtractor(
const CompiledRule& rule, const UniLib::RegexMatcher& matcher,
int locale_id, const UniLib* unilib,
const std::vector<std::unique_ptr<const UniLib::RegexPattern>>&
extractor_rules,
const std::unordered_map<DatetimeExtractorType,
std::unordered_map<int, int>>&
type_and_locale_to_extractor_rule)
: rule_(rule),
matcher_(matcher),
locale_id_(locale_id),
unilib_(*unilib),
rules_(extractor_rules),
type_and_locale_to_rule_(type_and_locale_to_extractor_rule) {}
bool Extract(DatetimeParsedData* result, CodepointSpan* result_span) const;
private:
bool RuleIdForType(DatetimeExtractorType type, int* rule_id) const;
// Returns true if the rule for given extractor matched. If it matched,
// match_result will contain the first group of the rule (if match_result not
// nullptr).
bool ExtractType(const UnicodeText& input,
DatetimeExtractorType extractor_type,
UnicodeText* match_result = nullptr) const;
bool GroupTextFromMatch(int group_id, UnicodeText* result) const;
// Updates the span to include the current match for the given group.
bool UpdateMatchSpan(int group_id, CodepointSpan* span) const;
// Returns true if any of the extractors from 'mapping' matched. If it did,
// will fill 'result' with the associated value from 'mapping'.
template <typename T>
bool MapInput(const UnicodeText& input,
const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
T* result) const;
bool ParseDigits(const UnicodeText& input, int* parsed_digits) const;
bool ParseWrittenNumber(const UnicodeText& input, int* parsed_number) const;
bool ParseYear(const UnicodeText& input, int* parsed_year) const;
bool ParseMonth(const UnicodeText& input, int* parsed_month) const;
bool ParseMeridiem(const UnicodeText& input, int* parsed_meridiem) const;
bool ParseRelativeValue(
const UnicodeText& input,
DatetimeComponent::RelativeQualifier* parsed_relative_value) const;
bool ParseRelationDistance(const UnicodeText& input,
int* parsed_distance) const;
bool ParseFieldType(
const UnicodeText& input,
DatetimeComponent::ComponentType* parsed_field_type) const;
bool ParseDayOfWeek(const UnicodeText& input, int* parsed_day_of_week) const;
bool ParseRelationAndConvertToRelativeCount(const UnicodeText& input,
int* relative_count) const;
// There are some special words which represent multiple date time components
// e.g. if the text says “by noon” it clearly indicates that the hour is 12,
// minute is 0 and meridiam is PM.
// The method handles such tokens and translates them into multiple date time
// components.
bool ParseAbsoluteDateValues(
const UnicodeText& input,
std::unordered_map<DatetimeComponent::ComponentType, int>* values) const;
const CompiledRule& rule_;
const UniLib::RegexMatcher& matcher_;
int locale_id_;
const UniLib& unilib_;
const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& rules_;
const std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>&
type_and_locale_to_rule_;
};
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_