/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
#define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_

#include "base/scoped_arena_allocator.h"
#include "base/scoped_arena_containers.h"
#include "induction_var_range.h"
#include "loop_analysis.h"
#include "nodes.h"
#include "optimization.h"
#include "superblock_cloner.h"

namespace art {

class CompilerOptions;
class ArchNoOptsLoopHelper;

/**
 * Loop optimizations. Builds a loop hierarchy and applies optimizations to
 * the detected nested loops, such as removal of dead induction and empty loops
 * and inner loop vectorization.
 */
class HLoopOptimization : public HOptimization {
 public:
  HLoopOptimization(HGraph* graph,
                    const CodeGenerator& codegen,    // Needs info about the target.
                    HInductionVarAnalysis* induction_analysis,
                    OptimizingCompilerStats* stats,
                    const char* name = kLoopOptimizationPassName);

  bool Run() override;

  static constexpr const char* kLoopOptimizationPassName = "loop_optimization";

 private:
  /**
   * A single loop inside the loop hierarchy representation.
   */
  struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
    explicit LoopNode(HLoopInformation* lp_info)
        : loop_info(lp_info),
          outer(nullptr),
          inner(nullptr),
          previous(nullptr),
          next(nullptr) {}
    HLoopInformation* loop_info;
    LoopNode* outer;
    LoopNode* inner;
    LoopNode* previous;
    LoopNode* next;
  };

  /*
   * Vectorization restrictions (bit mask).
   */
  enum VectorRestrictions {
    kNone            = 0,        // no restrictions
    kNoMul           = 1 << 0,   // no multiplication
    kNoDiv           = 1 << 1,   // no division
    kNoShift         = 1 << 2,   // no shift
    kNoShr           = 1 << 3,   // no arithmetic shift right
    kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
    kNoSignedHAdd    = 1 << 5,   // no signed halving add
    kNoUnsignedHAdd  = 1 << 6,   // no unsigned halving add
    kNoUnroundedHAdd = 1 << 7,   // no unrounded halving add
    kNoAbs           = 1 << 8,   // no absolute value
    kNoStringCharAt  = 1 << 9,   // no StringCharAt
    kNoReduction     = 1 << 10,  // no reduction
    kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
    kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
    kNoDotProd       = 1 << 13,  // no dot product
  };
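
  // A minimal illustrative sketch (not part of the pass) of how such a mask is
  // consumed during analysis. TrySetVectorType() below fills in the actual mask
  // per target ISA and packed type; the 'instruction' and the mask value here
  // are hypothetical:
  //
  //   uint64_t restrictions = kNoDiv | kNoShr;  // assumed target limitations
  //   if (instruction->IsDiv() && (restrictions & kNoDiv) != 0) {
  //     return false;  // target cannot vectorize division; reject this use
  //   }
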
  /*
   * Vectorization mode during synthesis
   * (sequential peeling/cleanup loop or vector loop).
   */
  enum VectorMode {
    kSequential,
    kVector
  };

  /*
   * Representation of a unit-stride array reference.
   */
  struct ArrayReference {
    ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
        : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
    // Lexicographic order on (base, offset, type, lhs, is_string_char_at).
    bool operator<(const ArrayReference& other) const {
      return
          (base < other.base) ||
          (base == other.base &&
           (offset < other.offset ||
            (offset == other.offset &&
             (type < other.type ||
              (type == other.type &&
               (lhs < other.lhs ||
                (lhs == other.lhs &&
                 is_string_char_at < other.is_string_char_at)))))));
    }
    HInstruction* base;      // base address
    HInstruction* offset;    // offset + i
    DataType::Type type;     // component type
    bool lhs;                // def/use
    bool is_string_char_at;  // compressed string read
  };

  //
  // Loop setup and traversal.
  //

  bool LocalRun();
  void AddLoop(HLoopInformation* loop_info);
  void RemoveLoop(LoopNode* node);

  // Traverses all loops inner to outer to perform simplifications and optimizations.
  // Returns true if loops nested inside the current loop (node) have changed.
  bool TraverseLoopsInnerToOuter(LoopNode* node);

  //
  // Optimization.
  //

  void SimplifyInduction(LoopNode* node);
  void SimplifyBlocks(LoopNode* node);

  // Performs optimizations specific to an inner loop with finite header logic (empty loop
  // removal, unrolling, vectorization). Returns true if anything changed.
  bool TryOptimizeInnerLoopFinite(LoopNode* node);

  // Performs optimizations specific to an inner loop. Returns true if anything changed.
  bool OptimizeInnerLoop(LoopNode* node);

  // Tries to apply loop unrolling for branch penalty reduction and better instruction
  // scheduling opportunities. Returns whether the transformation happened. 'generate_code'
  // determines whether the optimization should actually be applied.
  bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
                                             bool generate_code = true);

  // Tries to apply loop peeling to eliminate loop-invariant exits. Returns whether the
  // transformation happened. 'generate_code' determines whether the optimization should
  // actually be applied.
  bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
                                                  bool generate_code = true);

  // Tries to perform whole loop unrolling for a small loop with a small trip count to
  // eliminate the loop-check overhead and to create more opportunities for inter-iteration
  // optimizations. Returns whether the transformation happened. 'generate_code' determines
  // whether the optimization should actually be applied.
  bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);

  // Tries to apply scalar loop peeling and unrolling.
  bool TryPeelingAndUnrolling(LoopNode* node);

  //
  // Vectorization analysis and synthesis.
  //

  bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
  void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
  void GenerateNewLoop(LoopNode* node,
                       HBasicBlock* block,
                       HBasicBlock* new_preheader,
                       HInstruction* lo,
                       HInstruction* hi,
                       HInstruction* step,
                       uint32_t unroll);
  bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
  bool VectorizeUse(LoopNode* node,
                    HInstruction* instruction,
                    bool generate_code,
                    DataType::Type type,
                    uint64_t restrictions);
  uint32_t GetVectorSizeInBytes();
  bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
  bool TrySetVectorLengthImpl(uint32_t length);
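
  // TrySetVectorLength() checks that a chosen vector length fills a full SIMD
  // register. A minimal worked example, assuming a hypothetical 16-byte
  // (128-bit) vector register: for DataType::Type::kInt16 (2 bytes per lane),
  // only length == 8 gives 2 * 8 == 16 == GetVectorSizeInBytes(); length == 4
  // would fill only half the register and is rejected in this mode, as the
  // DCHECK below asserts.
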
  bool TrySetVectorLength(DataType::Type type, uint32_t length) {
    bool res = TrySetVectorLengthImpl(length);
    // Currently the vectorizer supports only the mode in which full SIMD registers are used.
    DCHECK(!res || (DataType::Size(type) * length == GetVectorSizeInBytes()));
    return res;
  }

  void GenerateVecInv(HInstruction* org, DataType::Type type);
  void GenerateVecSub(HInstruction* org, HInstruction* offset);
  void GenerateVecMem(HInstruction* org,
                      HInstruction* opa,
                      HInstruction* opb,
                      HInstruction* offset,
                      DataType::Type type);
  void GenerateVecReductionPhi(HPhi* phi);
  void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
  HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
  void GenerateVecOp(HInstruction* org,
                     HInstruction* opa,
                     HInstruction* opb,
                     DataType::Type type);

  // Vectorization idioms.
  bool VectorizeSaturationIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeHalvingAddIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeSADIdiom(LoopNode* node,
                         HInstruction* instruction,
                         bool generate_code,
                         DataType::Type type,
                         uint64_t restrictions);
  bool VectorizeDotProdIdiom(LoopNode* node,
                             HInstruction* instruction,
                             bool generate_code,
                             DataType::Type type,
                             uint64_t restrictions);

  // Vectorization heuristics.
  Alignment ComputeAlignment(HInstruction* offset,
                             DataType::Type type,
                             bool is_string_char_at,
                             uint32_t peeling = 0);
  void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
                            const ArrayReference* peeling_candidate);
  uint32_t MaxNumberPeeled();
  bool IsVectorizationProfitable(int64_t trip_count);

  //
  // Helpers.
  //

  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
  bool TrySetPhiReduction(HPhi* phi);

  // Detects a loop header with a single induction (returned in main_phi), possibly
  // other phis for reductions, but no other side effects. Returns true on success.
  bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);

  bool IsEmptyBody(HBasicBlock* block);
  bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                           HInstruction* instruction,
                           bool collect_loop_uses,
                           /*out*/ uint32_t* use_count);
  bool IsUsedOutsideLoop(HLoopInformation* loop_info, HInstruction* instruction);
  bool TryReplaceWithLastValue(HLoopInformation* loop_info,
                               HInstruction* instruction,
                               HBasicBlock* block);
  bool TryAssignLastValue(HLoopInformation* loop_info,
                          HInstruction* instruction,
                          HBasicBlock* block,
                          bool collect_loop_uses);
  void RemoveDeadInstructions(const HInstructionList& list);
  bool CanRemoveCycle();  // Whether the current 'iset_' is removable.

  bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }

  // Compiler options (to query ISA features).
  const CompilerOptions* compiler_options_;

  // Cached target SIMD vector register size in bytes.
  const size_t simd_register_size_;

  // Range information based on prior induction variable analysis.
  InductionVarRange induction_range_;

  // Phase-local heap memory allocator for the loop optimizer. Storage obtained
  // through this allocator is immediately released when the loop optimizer is done.
  ScopedArenaAllocator* loop_allocator_;

  // Global heap memory allocator. Used to build HIR.
  ArenaAllocator* global_allocator_;

  // Entries into the loop hierarchy representation. The hierarchy resides
  // in phase-local heap memory.
  LoopNode* top_loop_;
  LoopNode* last_loop_;

  // Temporary bookkeeping of a set of instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<HInstruction*>* iset_;

  // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
  // (1) reductions in the loop body are mapped back to their phi definition,
  // (2) phi definitions are mapped to their initial value (updated during
  //     code generation to feed the proper values into the new chain).
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
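
  // A minimal illustrative sketch (instruction names are hypothetical) of the
  // two-fold mapping above for a reduction such as 'sum = sum + a[i]':
  //
  //   reductions_->Put(add, sum_phi);         // (1) body reduction -> its phi
  //   reductions_->Put(sum_phi, init_value);  // (2) phi -> initial value
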
  // Flag that tracks if any simplifications have occurred.
  bool simplified_;

  // Whether to use predicated loop vectorization (e.g. for an arm64 SVE target).
  bool predicated_vectorization_mode_;

  // Number of "lanes" for the selected packed type.
  uint32_t vector_length_;

  // Set of array references in the vector loop.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<ArrayReference>* vector_refs_;

  // Static or dynamic loop peeling for alignment.
  uint32_t vector_static_peeling_factor_;
  const ArrayReference* vector_dynamic_peeling_candidate_;

  // Dynamic data dependence test of the form a != b.
  HInstruction* vector_runtime_test_a_;
  HInstruction* vector_runtime_test_b_;

  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
  // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
  // structure maps original instructions into the new instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;

  // Permanent mapping used during vectorization synthesis.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;

  // Temporary vectorization bookkeeping.
  VectorMode vector_mode_;         // synthesis mode
  HBasicBlock* vector_preheader_;  // preheader of the new loop
  HBasicBlock* vector_header_;     // header of the new loop
  HBasicBlock* vector_body_;       // body of the new loop
  HInstruction* vector_index_;     // normalized index of the new loop

  // Helper for target-specific behaviour in loop optimizations.
  ArchNoOptsLoopHelper* arch_loop_helper_;

  friend class LoopOptimizationTest;

  DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
};

}  // namespace art

#endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_