Skip to content

Commit

Permalink
Reduce memory usage of range extension thunks
Browse files Browse the repository at this point in the history
Previously, we allocated a 4-byte struct for each relocation to store a
reference to a thunk in case the relocation needed to branch to one.
Most of those references were null because most relocations don't need thunks.

This commit changes how we manage references to range extension thunks.
Now, thunk addresses are stored in symbols rather than in relocations.

Linking clang-19 for ARM64 is now ~7% faster than before.
  • Loading branch information
rui314 committed Jan 2, 2025
1 parent 89f3ba8 commit 9fc0ace
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 160 deletions.
12 changes: 6 additions & 6 deletions src/arch-arm32.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,14 @@ template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);

auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable {
for (; i < output_section->thunks.size(); i++) {
auto get_tls_trampoline_addr = [&](u64 addr) {
for (i64 i = 0; i < output_section->thunks.size(); i++) {
i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
addr;
if (is_jump_reachable(disp))
if (-branch_distance<E> <= disp && disp < branch_distance<E>)
return disp;
}
unreachable();
abort();
};

for (i64 i = 0; i < rels.size(); i++) {
Expand All @@ -284,8 +284,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;

auto get_thumb_thunk_addr = [&] { return get_thunk_addr(i); };
auto get_arm_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
auto get_thumb_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); };
auto get_arm_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 4; };

switch (rel.r_type) {
case R_ARM_ABS32:
Expand Down
2 changes: 1 addition & 1 deletion src/arch-arm64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {

i64 val = S + A - P;
if (val < -(1 << 27) || (1 << 27) <= val)
val = get_thunk_addr(i) + A - P;
val = sym.get_thunk_addr(ctx, P) + A - P;
*(ul32 *)loc |= bits(val, 27, 2);
break;
}
Expand Down
4 changes: 2 additions & 2 deletions src/arch-ppc32.cc
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,14 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
case R_PPC_LOCAL24PC: {
i64 val = S + A - P;
if (sign_extend(val, 25) != val)
val = get_thunk_addr(i) - P;
val = sym.get_thunk_addr(ctx, P) - P;
*(ub32 *)loc |= bits(val, 25, 2) << 2;
break;
}
case R_PPC_PLTREL24: {
i64 val = S - P;
if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
val = get_thunk_addr(i) - P;
val = sym.get_thunk_addr(ctx, P) - P;
*(ub32 *)loc |= bits(val, 25, 2) << 2;
break;
}
Expand Down
2 changes: 1 addition & 1 deletion src/arch-ppc64v1.cc
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
case R_PPC64_REL24: {
i64 val = sym.get_addr(ctx, NO_OPD) + A - P;
if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
val = get_thunk_addr(i) + A - P;
val = sym.get_thunk_addr(ctx, P) + A - P;

check(val, -(1 << 25), 1 << 25);
*(ub32 *)loc |= bits(val, 25, 2) << 2;
Expand Down
4 changes: 2 additions & 2 deletions src/arch-ppc64v2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
u64 GOT = ctx.got->shdr.sh_addr;
u64 TOC = ctx.extra.TOC->value;

auto r2save_thunk_addr = [&] { return get_thunk_addr(i); };
auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 8; };
auto r2save_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); };
auto no_r2save_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 8; };

switch (rel.r_type) {
case R_PPC64_TOC16_HA:
Expand Down
4 changes: 4 additions & 0 deletions src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,10 @@ int mold_main(int argc, char **argv) {

// At this point, both memory and file layouts are fixed.

// Gather thunk addresses and attach each one to the symbol it serves.
if constexpr (needs_thunk<E>)
gather_thunk_addresses(ctx);

t_before_copy.stop();

// Create an output file
Expand Down
93 changes: 53 additions & 40 deletions src/mold.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,11 @@ struct SymbolAux {
i32 plt_idx = -1;
i32 pltgot_idx = -1;
i32 dynsym_idx = -1;
i32 opd_idx = -1;
u32 djb_hash = 0;
};

template <>
struct SymbolAux<PPC64V1> : SymbolAux<X86_64> {
i32 opd_idx = -1;
// For range extension thunks
std::vector<u64> thunk_addrs;
};

//
Expand All @@ -112,20 +111,41 @@ class Thunk<E> {
idx * E::thunk_size;
}

static constexpr i64 alignment = 16;

OutputSection<E> &output_section;
i64 offset;
std::mutex mu;
std::vector<Symbol<E> *> symbols;
};

struct ThunkRef {
static constexpr i64 MAX_SYM_IDX = (1 << 17) - 1;
template <needs_thunk E> void gather_thunk_addresses(Context<E> &);

i32 thunk_idx : 14 = -1;
i32 sym_idx : 18 = -1;
};
// Computes, at compile time, how far (in bytes) a branch instruction can
// reach in either direction on target E. Callers treat the result D as
// the half-open range [-D, D) around the branch location.
template <needs_thunk E>
static consteval i64 get_branch_distance() {
  if constexpr (is_arm64<E>) {
    // An ARM64 branch carries a 26-bit immediate. Instructions are
    // 4-byte aligned, so the low two bits are implicitly zero and the
    // operand is effectively 28 bits wide: [-2^27, 2^27), i.e.
    // PC ± 128 MiB.
    return 1 << 27;
  } else if constexpr (is_arm32<E>) {
    // An ARM32 Thumb branch carries a 24-bit immediate and instructions
    // are 2-byte aligned, giving an effective 25 bits: [-2^24, 2^24),
    // i.e. PC ± 16 MiB.
    //
    // Non-Thumb ARM32 branches reach twice as far, but we conservatively
    // apply the Thumb limit to both.
    return 1 << 24;
  } else {
    // A PPC branch carries a 24-bit immediate and instructions are
    // 4-byte aligned, so the reach is [-2^25, 2^25), i.e. PC ± 32 MiB.
    assert(is_ppc<E>);
    return 1 << 25;
  }
}

// Per-target branch reach in bytes, initialized from
// get_branch_distance<E>(). Reachability checks compare a displacement
// against the half-open range [-branch_distance<E>, branch_distance<E>).
template <needs_thunk E>
static constexpr i64 branch_distance = get_branch_distance<E>();

//
// input-sections.cc
Expand Down Expand Up @@ -228,11 +248,6 @@ struct FdeRecord {
template <typename E>
struct InputSectionExtras {};

template <needs_thunk E>
struct InputSectionExtras<E> {
std::vector<ThunkRef> thunk_refs;
};

template <typename E> requires is_riscv<E> || is_loongarch<E>
struct InputSectionExtras<E> {
std::vector<i32> r_deltas;
Expand Down Expand Up @@ -318,8 +333,6 @@ class __attribute__((aligned(4))) InputSection {
void apply_toc_rel(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel,
u8 *loc, u64 S, i64 A, u64 P, ElfRel<E> **dynrel);

u64 get_thunk_addr(i64 idx) requires needs_thunk<E>;

std::optional<u64> get_tombstone(Symbol<E> &sym, SectionFragment<E> *frag);
};

Expand Down Expand Up @@ -2177,17 +2190,6 @@ enum {
NEEDS_PPC_OPD = 1 << 7, // for PPCv1
};

// A struct to hold target-dependent symbol members.
template <typename E>
struct SymbolExtras {};

template <needs_thunk E>
struct SymbolExtras<E> {
// For range extension thunks
i16 thunk_idx = -1;
i16 thunk_sym_idx = -1;
};

// Flags for Symbol<E>::get_addr()
enum {
NO_PLT = 1 << 0, // Request an address other than .plt
Expand Down Expand Up @@ -2247,6 +2249,9 @@ class Symbol {
u32 get_djb_hash(Context<E> &ctx) const;
void set_djb_hash(Context<E> &ctx, u32 hash);

void add_thunk_addr(Context<E> &ctx, u64 addr) requires needs_thunk<E>;
u64 get_thunk_addr(Context<E> &ctx, u64 P) const requires needs_thunk<E>;

bool is_absolute() const;
bool is_relative() const { return !is_absolute(); }
bool is_local(Context<E> &ctx) const;
Expand Down Expand Up @@ -2445,9 +2450,6 @@ class Symbol {

// If true, we try to demangle the symbol when printing.
bool demangle : 1 = false;

// Target-dependent extra members.
[[no_unique_address]] SymbolExtras<E> extra;
};

template <typename E>
Expand Down Expand Up @@ -2560,13 +2562,6 @@ InputSection<E>::get_fragment(Context<E> &ctx, const ElfRel<E> &rel) {
return {p.first, p.second + get_addend(*this, rel)};
}

template <typename E>
u64 InputSection<E>::get_thunk_addr(i64 idx) requires needs_thunk<E> {
ThunkRef ref = extra.thunk_refs[idx];
assert(ref.thunk_idx != -1);
return output_section->thunks[ref.thunk_idx]->get_addr(ref.sym_idx);
}

// Input object files may contain duplicate code for inline functions
// and such. Linkers de-duplicate them at link-time. However, linkers
// generally don't remove debug info for de-duplicated functions because
Expand Down Expand Up @@ -2945,6 +2940,24 @@ inline void Symbol<E>::set_djb_hash(Context<E> &ctx, u32 hash) {
ctx.symbol_aux[aux_idx].djb_hash = hash;
}

// Appends `addr` to the list of range extension thunk addresses kept for
// this symbol in ctx.symbol_aux. add_aux() is called first; the entry is
// then looked up through aux_idx.
template <typename E>
void Symbol<E>::add_thunk_addr(Context<E> &ctx, u64 addr) requires needs_thunk<E> {
  add_aux(ctx);
  auto &addrs = ctx.symbol_aux[aux_idx].thunk_addrs;
  addrs.push_back(addr);
}

// Returns the first thunk address recorded for this symbol whose
// displacement from the branch location P falls within
// [-branch_distance<E>, branch_distance<E>). Entries are examined in the
// order they were stored. Calls abort() if no recorded address qualifies.
template <typename E>
u64
Symbol<E>::get_thunk_addr(Context<E> &ctx, u64 P) const requires needs_thunk<E> {
  assert(aux_idx != -1);
  const auto &candidates = ctx.symbol_aux[aux_idx].thunk_addrs;

  for (u64 candidate : candidates) {
    i64 delta = candidate - P;

    // Skip thunks that lie outside the branch's reach on either side.
    bool too_far_back = delta < -branch_distance<E>;
    bool too_far_ahead = branch_distance<E> <= delta;
    if (too_far_back || too_far_ahead)
      continue;

    return candidate;
  }

  abort();
}

template <typename E>
inline bool Symbol<E>::has_plt(Context<E> &ctx) const {
return get_plt_idx(ctx) != -1 || get_pltgot_idx(ctx) != -1;
Expand Down
Loading

0 comments on commit 9fc0ace

Please sign in to comment.