You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
357 lines
14 KiB
357 lines
14 KiB
/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
|
|
#define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
|
|
|
|
#include "base/scoped_arena_allocator.h"
|
|
#include "base/scoped_arena_containers.h"
|
|
#include "induction_var_range.h"
|
|
#include "loop_analysis.h"
|
|
#include "nodes.h"
|
|
#include "optimization.h"
|
|
#include "superblock_cloner.h"
|
|
|
|
namespace art {
|
|
|
|
class CompilerOptions;
|
|
class ArchNoOptsLoopHelper;
|
|
|
|
/**
|
|
* Loop optimizations. Builds a loop hierarchy and applies optimizations to
|
|
* the detected nested loops, such as removal of dead induction and empty loops
|
|
* and inner loop vectorization.
|
|
*/
|
|
class HLoopOptimization : public HOptimization {
|
|
public:
|
|
HLoopOptimization(HGraph* graph,
|
|
const CodeGenerator& codegen, // Needs info about the target.
|
|
HInductionVarAnalysis* induction_analysis,
|
|
OptimizingCompilerStats* stats,
|
|
const char* name = kLoopOptimizationPassName);
|
|
|
|
bool Run() override;
|
|
|
|
static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
|
|
|
|
private:
|
|
/**
|
|
* A single loop inside the loop hierarchy representation.
|
|
*/
|
|
struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
|
|
explicit LoopNode(HLoopInformation* lp_info)
|
|
: loop_info(lp_info),
|
|
outer(nullptr),
|
|
inner(nullptr),
|
|
previous(nullptr),
|
|
next(nullptr) {}
|
|
HLoopInformation* loop_info;
|
|
LoopNode* outer;
|
|
LoopNode* inner;
|
|
LoopNode* previous;
|
|
LoopNode* next;
|
|
};
|
|
|
|
/*
|
|
* Vectorization restrictions (bit mask).
|
|
*/
|
|
enum VectorRestrictions {
|
|
kNone = 0, // no restrictions
|
|
kNoMul = 1 << 0, // no multiplication
|
|
kNoDiv = 1 << 1, // no division
|
|
kNoShift = 1 << 2, // no shift
|
|
kNoShr = 1 << 3, // no arithmetic shift right
|
|
kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits
|
|
kNoSignedHAdd = 1 << 5, // no signed halving add
|
|
kNoUnsignedHAdd = 1 << 6, // no unsigned halving add
|
|
kNoUnroundedHAdd = 1 << 7, // no unrounded halving add
|
|
kNoAbs = 1 << 8, // no absolute value
|
|
kNoStringCharAt = 1 << 9, // no StringCharAt
|
|
kNoReduction = 1 << 10, // no reduction
|
|
kNoSAD = 1 << 11, // no sum of absolute differences (SAD)
|
|
kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening
|
|
kNoDotProd = 1 << 13, // no dot product
|
|
};
|
|
|
|
/*
|
|
* Vectorization mode during synthesis
|
|
* (sequential peeling/cleanup loop or vector loop).
|
|
*/
|
|
enum VectorMode {
|
|
kSequential,
|
|
kVector
|
|
};
|
|
|
|
/*
|
|
* Representation of a unit-stride array reference.
|
|
*/
|
|
struct ArrayReference {
|
|
ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
|
|
: base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
|
|
bool operator<(const ArrayReference& other) const {
|
|
return
|
|
(base < other.base) ||
|
|
(base == other.base &&
|
|
(offset < other.offset || (offset == other.offset &&
|
|
(type < other.type ||
|
|
(type == other.type &&
|
|
(lhs < other.lhs ||
|
|
(lhs == other.lhs &&
|
|
is_string_char_at < other.is_string_char_at)))))));
|
|
}
|
|
HInstruction* base; // base address
|
|
HInstruction* offset; // offset + i
|
|
DataType::Type type; // component type
|
|
bool lhs; // def/use
|
|
bool is_string_char_at; // compressed string read
|
|
};
|
|
|
|
//
|
|
// Loop setup and traversal.
|
|
//
|
|
|
|
bool LocalRun();
|
|
void AddLoop(HLoopInformation* loop_info);
|
|
void RemoveLoop(LoopNode* node);
|
|
|
|
// Traverses all loops inner to outer to perform simplifications and optimizations.
|
|
// Returns true if loops nested inside current loop (node) have changed.
|
|
bool TraverseLoopsInnerToOuter(LoopNode* node);
|
|
|
|
//
|
|
// Optimization.
|
|
//
|
|
|
|
void SimplifyInduction(LoopNode* node);
|
|
void SimplifyBlocks(LoopNode* node);
|
|
|
|
// Performs optimizations specific to inner loop with finite header logic (empty loop removal,
|
|
// unrolling, vectorization). Returns true if anything changed.
|
|
bool TryOptimizeInnerLoopFinite(LoopNode* node);
|
|
|
|
// Performs optimizations specific to inner loop. Returns true if anything changed.
|
|
bool OptimizeInnerLoop(LoopNode* node);
|
|
|
|
// Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
|
|
// opportunities. Returns whether transformation happened. 'generate_code' determines whether the
|
|
// optimization should be actually applied.
|
|
bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
|
|
bool generate_code = true);
|
|
|
|
// Tries to apply loop peeling for loop invariant exits elimination. Returns whether
|
|
// transformation happened. 'generate_code' determines whether the optimization should be
|
|
// actually applied.
|
|
bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
|
|
bool generate_code = true);
|
|
|
|
// Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
|
|
// the loop check overhead and to have more opportunities for inter-iteration optimizations.
|
|
// Returns whether transformation happened. 'generate_code' determines whether the optimization
|
|
// should be actually applied.
|
|
bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
|
|
|
|
// Tries to apply scalar loop peeling and unrolling.
|
|
bool TryPeelingAndUnrolling(LoopNode* node);
|
|
|
|
//
|
|
// Vectorization analysis and synthesis.
|
|
//
|
|
|
|
bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
|
|
void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
|
|
void GenerateNewLoop(LoopNode* node,
|
|
HBasicBlock* block,
|
|
HBasicBlock* new_preheader,
|
|
HInstruction* lo,
|
|
HInstruction* hi,
|
|
HInstruction* step,
|
|
uint32_t unroll);
|
|
bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
|
|
bool VectorizeUse(LoopNode* node,
|
|
HInstruction* instruction,
|
|
bool generate_code,
|
|
DataType::Type type,
|
|
uint64_t restrictions);
|
|
uint32_t GetVectorSizeInBytes();
|
|
bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
|
|
bool TrySetVectorLengthImpl(uint32_t length);
|
|
|
|
bool TrySetVectorLength(DataType::Type type, uint32_t length) {
|
|
bool res = TrySetVectorLengthImpl(length);
|
|
// Currently the vectorizer supports only the mode when full SIMD registers are used.
|
|
DCHECK(!res || (DataType::Size(type) * length == GetVectorSizeInBytes()));
|
|
return res;
|
|
}
|
|
|
|
void GenerateVecInv(HInstruction* org, DataType::Type type);
|
|
void GenerateVecSub(HInstruction* org, HInstruction* offset);
|
|
void GenerateVecMem(HInstruction* org,
|
|
HInstruction* opa,
|
|
HInstruction* opb,
|
|
HInstruction* offset,
|
|
DataType::Type type);
|
|
void GenerateVecReductionPhi(HPhi* phi);
|
|
void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
|
|
HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
|
|
void GenerateVecOp(HInstruction* org,
|
|
HInstruction* opa,
|
|
HInstruction* opb,
|
|
DataType::Type type);
|
|
|
|
// Vectorization idioms.
|
|
bool VectorizeSaturationIdiom(LoopNode* node,
|
|
HInstruction* instruction,
|
|
bool generate_code,
|
|
DataType::Type type,
|
|
uint64_t restrictions);
|
|
bool VectorizeHalvingAddIdiom(LoopNode* node,
|
|
HInstruction* instruction,
|
|
bool generate_code,
|
|
DataType::Type type,
|
|
uint64_t restrictions);
|
|
bool VectorizeSADIdiom(LoopNode* node,
|
|
HInstruction* instruction,
|
|
bool generate_code,
|
|
DataType::Type type,
|
|
uint64_t restrictions);
|
|
bool VectorizeDotProdIdiom(LoopNode* node,
|
|
HInstruction* instruction,
|
|
bool generate_code,
|
|
DataType::Type type,
|
|
uint64_t restrictions);
|
|
|
|
// Vectorization heuristics.
|
|
Alignment ComputeAlignment(HInstruction* offset,
|
|
DataType::Type type,
|
|
bool is_string_char_at,
|
|
uint32_t peeling = 0);
|
|
void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
|
|
const ArrayReference* peeling_candidate);
|
|
uint32_t MaxNumberPeeled();
|
|
bool IsVectorizationProfitable(int64_t trip_count);
|
|
|
|
//
|
|
// Helpers.
|
|
//
|
|
|
|
bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
|
|
bool TrySetPhiReduction(HPhi* phi);
|
|
|
|
// Detects loop header with a single induction (returned in main_phi), possibly
|
|
// other phis for reductions, but no other side effects. Returns true on success.
|
|
bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
|
|
|
|
bool IsEmptyBody(HBasicBlock* block);
|
|
bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
|
|
HInstruction* instruction,
|
|
bool collect_loop_uses,
|
|
/*out*/ uint32_t* use_count);
|
|
bool IsUsedOutsideLoop(HLoopInformation* loop_info,
|
|
HInstruction* instruction);
|
|
bool TryReplaceWithLastValue(HLoopInformation* loop_info,
|
|
HInstruction* instruction,
|
|
HBasicBlock* block);
|
|
bool TryAssignLastValue(HLoopInformation* loop_info,
|
|
HInstruction* instruction,
|
|
HBasicBlock* block,
|
|
bool collect_loop_uses);
|
|
void RemoveDeadInstructions(const HInstructionList& list);
|
|
bool CanRemoveCycle(); // Whether the current 'iset_' is removable.
|
|
|
|
bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }
|
|
|
|
// Compiler options (to query ISA features).
|
|
const CompilerOptions* compiler_options_;
|
|
|
|
// Cached target SIMD vector register size in bytes.
|
|
const size_t simd_register_size_;
|
|
|
|
// Range information based on prior induction variable analysis.
|
|
InductionVarRange induction_range_;
|
|
|
|
// Phase-local heap memory allocator for the loop optimizer. Storage obtained
|
|
// through this allocator is immediately released when the loop optimizer is done.
|
|
ScopedArenaAllocator* loop_allocator_;
|
|
|
|
// Global heap memory allocator. Used to build HIR.
|
|
ArenaAllocator* global_allocator_;
|
|
|
|
// Entries into the loop hierarchy representation. The hierarchy resides
|
|
// in phase-local heap memory.
|
|
LoopNode* top_loop_;
|
|
LoopNode* last_loop_;
|
|
|
|
// Temporary bookkeeping of a set of instructions.
|
|
// Contents reside in phase-local heap memory.
|
|
ScopedArenaSet<HInstruction*>* iset_;
|
|
|
|
// Temporary bookkeeping of reduction instructions. Mapping is two-fold:
|
|
// (1) reductions in the loop-body are mapped back to their phi definition,
|
|
// (2) phi definitions are mapped to their initial value (updated during
|
|
// code generation to feed the proper values into the new chain).
|
|
// Contents reside in phase-local heap memory.
|
|
ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
|
|
|
|
// Flag that tracks if any simplifications have occurred.
|
|
bool simplified_;
|
|
|
|
// Whether to use predicated loop vectorization (e.g. for arm64 SVE target).
|
|
bool predicated_vectorization_mode_;
|
|
|
|
// Number of "lanes" for selected packed type.
|
|
uint32_t vector_length_;
|
|
|
|
// Set of array references in the vector loop.
|
|
// Contents reside in phase-local heap memory.
|
|
ScopedArenaSet<ArrayReference>* vector_refs_;
|
|
|
|
// Static or dynamic loop peeling for alignment.
|
|
uint32_t vector_static_peeling_factor_;
|
|
const ArrayReference* vector_dynamic_peeling_candidate_;
|
|
|
|
// Dynamic data dependence test of the form a != b.
|
|
HInstruction* vector_runtime_test_a_;
|
|
HInstruction* vector_runtime_test_b_;
|
|
|
|
// Mapping used during vectorization synthesis for both the scalar peeling/cleanup
|
|
// loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
|
|
// structure maps original instructions into the new instructions.
|
|
// Contents reside in phase-local heap memory.
|
|
ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
|
|
|
|
// Permanent mapping used during vectorization synthesis.
|
|
// Contents reside in phase-local heap memory.
|
|
ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
|
|
|
|
// Temporary vectorization bookkeeping.
|
|
VectorMode vector_mode_; // synthesis mode
|
|
HBasicBlock* vector_preheader_; // preheader of the new loop
|
|
HBasicBlock* vector_header_; // header of the new loop
|
|
HBasicBlock* vector_body_; // body of the new loop
|
|
HInstruction* vector_index_; // normalized index of the new loop
|
|
|
|
// Helper for target-specific behaviour for loop optimizations.
|
|
ArchNoOptsLoopHelper* arch_loop_helper_;
|
|
|
|
friend class LoopOptimizationTest;
|
|
|
|
DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
|
|
};
|
|
|
|
} // namespace art
|
|
|
|
#endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
|