Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 115 additions & 21 deletions src/passes/SimplifyLocals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include <ir/local-utils.h>
#include <ir/manipulation.h>
#include <ir/utils.h>
#include <optional>
#include <pass.h>
#include <wasm-builder.h>
#include <wasm-traversal.h>
Expand Down Expand Up @@ -91,6 +92,74 @@ struct SimplifyLocals
// locals in current linear execution trace, which we try to sink
Sinkables sinkables;

// Effects seen since the last full invalidation flush. We merge them and
// defer scanning all sinkables until a point where we must inspect or move
// the full sinkable set.
//
// This is empty (std::nullopt) when nothing is pending. The helpers that
// modify |sinkables| also reset it once no sinkables remain, since there is
// then nothing left that the accumulated effects could invalidate.
std::optional<EffectAnalyzer> pendingInvalidations;

// Forget every tracked sinkable, together with any invalidating effects that
// were still waiting to be applied to them.
void clearSinkables() {
  pendingInvalidations.reset();
  sinkables.clear();
}

// Hands the current sinkables over to the caller and leaves this walker with
// an empty set to continue with.
//
// Pending invalidations are applied first so that the returned set is exact.
// flushPendingInvalidations() always leaves pendingInvalidations empty, so no
// additional reset is needed here.
Sinkables takeSinkables() {
  flushPendingInvalidations();
  Sinkables taken = std::move(sinkables);
  // A moved-from container is only guaranteed to be in a valid but
  // unspecified state. Clear it explicitly, because the other helpers use
  // sinkables.empty() to decide whether anything needs to be tracked.
  sinkables.clear();
  return taken;
}

// Removes the sinkable that |it| points to. Once no sinkables remain, the
// pending invalidations have nothing left to apply to and are dropped too.
void eraseSinkable(typename Sinkables::iterator it) {
  sinkables.erase(it);
  if (!sinkables.empty()) {
    return;
  }
  pendingInvalidations.reset();
}

// Removes the sinkable stored under local index |key|, if there is one. Once
// no sinkables remain, the pending invalidations are dropped as well.
void eraseSinkable(Index key) {
  sinkables.erase(key);
  if (!sinkables.empty()) {
    return;
  }
  pendingInvalidations.reset();
}

// Records |effects| as something that may invalidate sinkables, without
// scanning the sinkables right now. The effects are merged into
// pendingInvalidations, which is created on first use. When there are no
// sinkables there is nothing to invalidate, so anything pending is dropped.
void mergePendingInvalidations(const EffectAnalyzer& effects) {
  if (!sinkables.empty()) {
    if (!pendingInvalidations.has_value()) {
      pendingInvalidations.emplace(this->getPassOptions(), *this->getModule());
    }
    pendingInvalidations->mergeIn(effects);
  } else {
    pendingInvalidations.reset();
  }
}

// Applies the pending invalidations to a single local only: if a sinkable is
// tracked for |index| and the accumulated effects are orderedAfter() that
// sinkable's effects, the sinkable is erased. All other sinkables, and the
// pending effects themselves, are left for a later full flush.
void resolvePendingInvalidations(Index index) {
  if (pendingInvalidations.has_value()) {
    auto found = sinkables.find(index);
    if (found == sinkables.end()) {
      return;
    }
    if (pendingInvalidations->orderedAfter(found->second.effects)) {
      eraseSinkable(found);
    }
  }
}

// Applies all pending invalidations to the full sinkable set via
// checkInvalidations(), then forgets them. Does nothing if nothing is
// pending. After this returns, pendingInvalidations is always empty.
//
// NOTE(review): checkInvalidations() is handed a reference to the object
// held by pendingInvalidations. If it erases the last sinkable through
// eraseSinkable(), that object is reset while the reference is still in
// scope - confirm the reference is not read after that point.
void flushPendingInvalidations() {
  if (pendingInvalidations.has_value()) {
    if (!sinkables.empty()) {
      checkInvalidations(*pendingInvalidations);
    }
    pendingInvalidations.reset();
  }
}

// Starts tracking the expression at |currp| as the sinkable for local |key|.
// Pending invalidations are flushed first, so effects merged before this
// point are applied to the existing sinkables and not carried over to the
// new entry.
void addSinkable(Index key, Expression** currp) {
  flushPendingInvalidations();
  SinkableInfo info(currp, this->getPassOptions(), *this->getModule());
  sinkables.emplace(key, std::move(info));
}

// Information about an exit from a block: the break, and the
// sinkables. For the final exit from a block (falling off)
// exitter is null.
Expand Down Expand Up @@ -135,8 +204,7 @@ struct SimplifyLocals
// value means the block already has a return value
self->unoptimizableBlocks.insert(br->name);
} else {
self->blockBreaks[br->name].push_back(
{currp, std::move(self->sinkables)});
self->blockBreaks[br->name].push_back({currp, self->takeSinkables()});
}
} else if (curr->is<Block>()) {
return; // handled in visitBlock
Expand All @@ -153,15 +221,15 @@ struct SimplifyLocals
}
// TODO: we could use this info to stop gathering data on these blocks
}
self->sinkables.clear();
self->clearSinkables();
}

// Called once the condition of an if-else has been processed. From here
// control flow branches into either the true or the false side, so everything
// tracked so far is forgotten.
static void doNoteIfCondition(
  SimplifyLocals<allowTee, allowStructure, allowNesting>* self,
  Expression** currp) {
  // clearSinkables() empties self->sinkables and also drops any pending
  // invalidations, so a separate self->sinkables.clear() is not needed.
  self->clearSinkables();
}

static void
Expand All @@ -170,13 +238,13 @@ struct SimplifyLocals
auto* iff = (*currp)->cast<If>();
if (iff->ifFalse) {
// We processed the ifTrue side of this if-else, save it on the stack.
self->ifStack.push_back(std::move(self->sinkables));
self->ifStack.push_back(self->takeSinkables());
} else {
// This is an if without an else.
if (allowStructure) {
self->optimizeIfReturn(iff, currp);
}
self->sinkables.clear();
self->clearSinkables();
}
}

Expand All @@ -191,10 +259,12 @@ struct SimplifyLocals
self->optimizeIfElseReturn(iff, currp, self->ifStack.back());
}
self->ifStack.pop_back();
self->sinkables.clear();
self->clearSinkables();
}

void visitBlock(Block* curr) {
flushPendingInvalidations();

bool hasBreaks = curr->name.is() && blockBreaks[curr->name].size() > 0;

if (allowStructure) {
Expand All @@ -204,25 +274,29 @@ struct SimplifyLocals
// post-block cleanups
if (curr->name.is()) {
if (unoptimizableBlocks.contains(curr->name)) {
sinkables.clear();
clearSinkables();
unoptimizableBlocks.erase(curr->name);
}

if (hasBreaks) {
// more than one path to here, so nonlinear
sinkables.clear();
clearSinkables();
blockBreaks.erase(curr->name);
}
}
}

// Visits a loop. Pending invalidations are applied first so that the sinkable
// set is exact; then, if structure optimizations are allowed, we try to turn
// a sinkable into the loop's return value.
void visitLoop(Loop* curr) {
  flushPendingInvalidations();

  if (!allowStructure) {
    return;
  }
  optimizeLoopReturn(curr);
}

void optimizeLocalGet(LocalGet* curr) {
resolvePendingInvalidations(curr->index);

auto found = sinkables.find(curr->index);
if (found != sinkables.end()) {
auto* set = (*found->second.item)
Expand Down Expand Up @@ -284,7 +358,7 @@ struct SimplifyLocals
// reuse the local.get that is dying
*found->second.item = curr;
ExpressionManipulator::nop(curr);
sinkables.erase(found);
eraseSinkable(found);
anotherCycle = true;
}
}
Expand All @@ -300,6 +374,10 @@ struct SimplifyLocals
}

void checkInvalidations(EffectAnalyzer& effects) {
if (sinkables.empty()) {
return;
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this tries to be faster by guessing that most interactions are local?

If that's the case, then I don't think that is true in general. We would need to measure on more codebases to check that. If it isn't generally true, then it could make us slower sometimes. It also adds chances for bugs to crop up here, as this code must remain in sync with effects.h.

Copy link
Copy Markdown
Contributor Author

@Changqing-JING Changqing-JING Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kripken thank you for the review.

Updated the PR: the fast path now classifies the current expression (not sinkables) using only existing EffectAnalyzer fields — no new methods in effects.h, nothing to keep in sync. When the expression only touches locals, we use reverse indices (localReadBySinkable, localWrittenBySinkable) to look up exactly the conflicting sinkables in O(|locals touched|) instead of scanning all sinkables; otherwise we fall back to the exact original loop, so worst case is identical to main. Benchmark still shows ~1.9x speedup.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First, there is still something to keep in sync: the code

    if (!effects.transfersControlFlow() && !effects.writesGlobalState() &&
        !effects.readsMutableGlobalState() && !effects.danglingPop &&
        !effects.trap && !effects.hasSynchronization() &&
        !effects.mayNotReturn

If, say, we added some new type of effect there, we'd need to add it to this list of all possible effects. So this PR adds complexity and increases the risk of bugs, in my opinion.

If this is a huge speedup it might be worth those downsides. However, I still have my concern from before, which is that this is a speedup for the case where local interactions are the most common thing, but that is not generally true. There could be codebases where e.g. GC struct/array interactions are more common. That is, there is nothing fundamental about local interactions that justifies focusing on them, AFAICT - this pass happens to optimize locals, but that's a coincidence?

Taking a step back, if this pattern of local effects is very common, there may be a better way to optimize it. That they are local effects and nothing else suggests things like copies,

(local.set $x (local.get $y))

or trivial, effect-less operations

(local.set $x (i32.eqz (local.get $y)))

I would like to understand which of those (or something else) is very common in the benchmark you are looking at. Perhaps we can add a tailored optimization for it. E.g. if copies are the issue, then running coalesce-locals first (which can remove many copies) might be even faster than this PR.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


I instrumented checkInvalidations on the benchmark. The 236M orderedAfter calls break down as:

  • 62% local-only (local.get/set, not copies — just normal operand reads/writes)
  • 37% trap/globalState
  • 0.3% GC struct/array

Local-only expressions dominate because every wasm function is full of local.get/set regardless of what higher-level ops it uses. The fast path reduces those 147M to 100K via reverse index (99.9% reduction).

Re sync concern: happy to move the check into EffectAnalyzer as a hasOnlyLocalEffects() method so there's one place to maintain. Or open to other approaches.


Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you share the code you used to measure those figures? I'm curious to check this on a variety of other wasm files.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I have pushed the instrumentation code to this PR.
To use it:

./build/bin/wasm-opt --simplify-locals -all  ./test3.wasm -o /dev/null
python ./scripts/analyze_ci_stats.py

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kripken
What's your opinion on continuing with this PR?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for not getting back to this sooner.

I did some measurements now, because my intuition is still that this is going to depend on the codebase. I tried it on a large Java testcase:

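|              | without patch           | with patch              |
|--------------|-------------------------|-------------------------|
| instructions | 6,944,304,703 +- 0.01%  | 7,580,744,634 +- 0.01%  |
| branches     | 1,443,675,249 +- 0.02%  | 1,601,024,431 +- 0.01%  |
| time         | 0.44 +- 0.64%           | 0.50 +- 1.63%           |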

(this is just your first commit, of course - not with the instrumentation)

In this code, there are fewer local interactions, I suppose, and more other things (likely GC load/store overlaps). So all the extra bookkeeping this PR adds just adds overhead.

As the patch adds a lot of complexity and maintenance burden, I don't think it makes sense to land. But I hope we can find general ways of speeding this up, that work on all or at least most wasms. To do that, we need to have a good rationale, I think, a reason why a fast path would be common, and local operations don't have a good reason to be exceptional here (and are not in practice, as this data shows).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kripken

I switched to a different concept.

This keeps the optimization inside SimplifyLocals: it batches invalidating effects and only flushes when the full sinkable set must be exact. For the common local.get/local.set cases, it checks just that one local, reducing repeated full scans without adding effects.h coupling.

Perf on test3.wasm

time ./build/bin/wasm-opt --simplify-locals --enable-bulk-memory --enable-multivalue --enable-reference-types --enable-gc --enable-tail-call --enable-exception-handling  -o /dev/null ./test3.wasm

real    0m26.991s
user    6m31.875s
sys     0m15.321s

Regression test on the Kotlin case

Main

 Performance counter stats for 'build/bin/wasm-opt 73cbe24d7cf5a54d37ad.wasm -all --simplify-locals -o /dev/null' (10 runs):

        8622250653      task-clock                       #    2.001 CPUs utilized               ( +-  0.56% )
               137      context-switches                 #   15.889 /sec                        ( +-  7.72% )
                 7      cpu-migrations                   #    0.812 /sec                        ( +- 13.88% )
             46391      page-faults                      #    5.380 K/sec                       ( +-  0.00% )
       25515064602      instructions                     #    1.29  insn per cycle              ( +-  0.00% )
       19828672694      cycles                           #    2.300 GHz                         ( +-  0.56% )
         114856501      branch-misses                                                           ( +-  1.13% )

            4.3082 +- 0.0263 seconds time elapsed  ( +-  0.61% )

This PR

Performance counter stats for 'build/bin/wasm-opt 73cbe24d7cf5a54d37ad.wasm -all --simplify-locals -o /dev/null' (10 runs):

        8320081458      task-clock                       #    1.952 CPUs utilized               ( +-  0.40% )
               118      context-switches                 #   14.183 /sec                        ( +-  3.17% )
                 8      cpu-migrations                   #    0.962 /sec                        ( +- 19.11% )
             46394      page-faults                      #    5.576 K/sec                       ( +-  0.00% )
       23979738215      instructions                     #    1.25  insn per cycle              ( +-  0.00% )
       19133790368      cycles                           #    2.300 GHz                         ( +-  0.40% )
         115928198      branch-misses                                                           ( +-  1.41% )

            4.2626 +- 0.0266 seconds time elapsed  ( +-  0.62% )


// TODO: this is O(bad)
std::vector<Index> invalidated;
for (auto& [index, info] : sinkables) {
Expand All @@ -308,7 +386,7 @@ struct SimplifyLocals
}
}
for (auto index : invalidated) {
sinkables.erase(index);
eraseSinkable(index);
}
}

Expand All @@ -321,9 +399,17 @@ struct SimplifyLocals
Expression** currp) {
Expression* curr = *currp;

if (self->sinkables.empty()) {
if (!allowNesting) {
self->expressionStack.push_back(curr);
}
return;
}

// Certain expressions cannot be sinked into 'try'/'try_table', and so at
// the start of 'try'/'try_table' we forget about them.
if (curr->is<Try>() || curr->is<TryTable>()) {
self->flushPendingInvalidations();
std::vector<Index> invalidated;
for (auto& [index, info] : self->sinkables) {
// Expressions that may throw cannot be moved into a try (which might
Expand All @@ -334,13 +420,13 @@ struct SimplifyLocals
}
}
for (auto index : invalidated) {
self->sinkables.erase(index);
self->eraseSinkable(index);
}
}

EffectAnalyzer effects(self->getPassOptions(), *self->getModule());
if (effects.checkPre(curr)) {
self->checkInvalidations(effects);
self->mergePendingInvalidations(effects);
}

if (!allowNesting) {
Expand Down Expand Up @@ -409,6 +495,8 @@ struct SimplifyLocals
auto* set = (*currp)->dynCast<LocalSet>();

if (set) {
self->resolvePendingInvalidations(set->index);

// if we see a set that was already potentially-sinkable, then the
// previous store is dead, leave just the value
auto found = self->sinkables.find(set->index);
Expand All @@ -419,22 +507,20 @@ struct SimplifyLocals
Drop* drop = ExpressionManipulator::convert<LocalSet, Drop>(previous);
drop->value = previousValue;
drop->finalize();
self->sinkables.erase(found);
self->eraseSinkable(found);
self->anotherCycle = true;
}
}

EffectAnalyzer effects(self->getPassOptions(), *self->getModule());
if (effects.checkPost(original)) {
self->checkInvalidations(effects);
self->mergePendingInvalidations(effects);
}

if (set && self->canSink(set)) {
Index index = set->index;
assert(!self->sinkables.contains(index));
self->sinkables.emplace(std::pair{
index,
SinkableInfo(currp, self->getPassOptions(), *self->getModule())});
self->addSinkable(index, currp);
}

if (!allowNesting) {
Expand Down Expand Up @@ -468,6 +554,8 @@ struct SimplifyLocals
std::vector<Loop*> loopsToEnlarge;

void optimizeLoopReturn(Loop* loop) {
flushPendingInvalidations();

// If there is a sinkable thing in an eligible loop, we can optimize
// it in a trivial way to the outside of the loop.
if (loop->type != Type::none) {
Expand Down Expand Up @@ -498,11 +586,13 @@ struct SimplifyLocals
this->replaceCurrent(set);
// We moved things around, clear all tracking; we'll do another cycle
// anyhow.
sinkables.clear();
clearSinkables();
anotherCycle = true;
}

void optimizeBlockReturn(Block* block) {
flushPendingInvalidations();

if (!block->name.is() || unoptimizableBlocks.contains(block->name)) {
return;
}
Expand Down Expand Up @@ -624,13 +714,15 @@ struct SimplifyLocals
auto* newLocalSet =
Builder(*this->getModule()).makeLocalSet(sharedIndex, block);
this->replaceCurrent(newLocalSet);
sinkables.clear();
clearSinkables();
anotherCycle = true;
block->finalize();
}

// optimize local.sets from both sides of an if into a return value
void optimizeIfElseReturn(If* iff, Expression** currp, Sinkables& ifTrue) {
flushPendingInvalidations();

assert(iff->ifFalse);
// if this if already has a result, or is unreachable code, we have
// nothing to do
Expand Down Expand Up @@ -753,6 +845,8 @@ struct SimplifyLocals
// that happens, other passes can "undo" this by turning an if with a copy
// arm into a one-sided if.
void optimizeIfReturn(If* iff, Expression** currp) {
flushPendingInvalidations();

// If this if is unreachable code, we have nothing to do.
if (iff->type != Type::none || iff->ifTrue->type != Type::none) {
return;
Expand Down Expand Up @@ -973,7 +1067,7 @@ struct SimplifyLocals
anotherCycle = true;
}
// clean up
sinkables.clear();
clearSinkables();
blockBreaks.clear();
unoptimizableBlocks.clear();
return anotherCycle;
Expand Down
Loading