Skip to content

[lld][LoongArch] GOT indirection to PC relative optimization #123743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: users/ylzsx/r-tlsdesc-to-iele-relax
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 243 additions & 1 deletion lld/ELF/Arch/LoongArch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,15 @@ class LoongArch final : public TargetInfo {
void relocate(uint8_t *loc, const Relocation &rel,
uint64_t val) const override;
bool relaxOnce(int pass) const override;
RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
void finalizeRelax(int passes) const override;

private:
void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
const Relocation &rLo12, uint64_t secAddr) const;
};
} // end anonymous namespace

Expand All @@ -58,6 +65,7 @@ enum Op {
LU12I_W = 0x14000000,
PCADDI = 0x18000000,
PCADDU12I = 0x1c000000,
PCALAU12I = 0x1a000000,
LD_W = 0x28800000,
LD_D = 0x28c00000,
JIRL = 0x4c000000,
Expand All @@ -69,6 +77,7 @@ enum Reg {
R_ZERO = 0,
R_RA = 1,
R_TP = 2,
R_A0 = 4,
R_T0 = 12,
R_T1 = 13,
R_T2 = 14,
Expand Down Expand Up @@ -959,11 +968,18 @@ static bool relax(Ctx &ctx, InputSection &sec) {
case R_LARCH_GOT_PC_HI20:
case R_LARCH_TLS_GD_PC_HI20:
case R_LARCH_TLS_LD_PC_HI20:
case R_LARCH_TLS_DESC_PC_HI20:
// The overflow check for i+2 will be carried out in isPairRelaxable.
if (isPairRelaxable(relocs, i))
relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
break;
case R_LARCH_TLS_DESC_PC_HI20:
if (r.expr == RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC ||
r.expr == R_RELAX_TLS_GD_TO_LE) {
if (relaxable(relocs, i))
remove = 4;
} else if (isPairRelaxable(relocs, i))
relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
break;
case R_LARCH_CALL36:
if (relaxable(relocs, i))
relaxCall36(ctx, sec, i, loc, r, remove);
Expand All @@ -979,6 +995,17 @@ static bool relax(Ctx &ctx, InputSection &sec) {
isUInt<12>(r.sym->getVA(ctx, r.addend)))
remove = 4;
break;
case R_LARCH_TLS_DESC_PC_LO12:
if (relaxable(relocs, i) &&
(r.expr == RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC ||
r.expr == R_RELAX_TLS_GD_TO_LE))
remove = 4;
break;
case R_LARCH_TLS_DESC_LD:
if (relaxable(relocs, i) && r.expr == R_RELAX_TLS_GD_TO_LE &&
isUInt<12>(r.sym->getVA(ctx, r.addend)))
remove = 4;
break;
}

// For all anchors whose offsets are <= r.offset, they are preceded by
Expand Down Expand Up @@ -1046,6 +1073,155 @@ static void tlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) {
}
}

// Convert TLSDESC GD/LD to IE.
// In normal or medium code model, there are two forms of code sequences:
// * pcalau12i $a0, %desc_pc_hi20(sym_desc)
// * addi.d $a0, $a0, %desc_pc_lo12(sym_desc)
// * ld.d $ra, $a0, %desc_ld(sym_desc)
// * jirl $ra, $ra, %desc_call(sym_desc)
// ------
// * pcaddi $a0, %desc_pcrel_20(a)
// * load $ra, $a0, %desc_ld(a)
// * jirl $ra, $ra, %desc_call(a)
//
// The code sequence obtained is as follows:
// * pcalau12i $a0, %ie_pc_hi20(sym_ie)
// * ld.[wd] $a0, $a0, %ie_pc_lo12(sym_ie)
//
// For simplicity, in both tlsdescToIe and tlsdescToLe we always convert the
// preceding instructions to NOPs, because both forms of code sequence
// (corresponding to relocation combinations:
// R_LARCH_TLS_DESC_PC_HI20+R_LARCH_TLS_DESC_PC_LO12 and
// R_LARCH_TLS_DESC_PCREL20_S2) follow the same process.
//
// When relaxation is enabled, redundant NOPs can be removed.
void LoongArch::tlsdescToIe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  const RelType type = rel.type;
  if (type == R_LARCH_TLS_DESC_PC_HI20 || type == R_LARCH_TLS_DESC_PC_LO12 ||
      type == R_LARCH_TLS_DESC_PCREL20_S2) {
    // The address-materializing instructions of the DESC sequence are not
    // needed by the IE form; overwrite them with NOPs. Relaxation may drop
    // these NOPs later.
    write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop
  } else if (type == R_LARCH_TLS_DESC_LD) {
    // Replace the descriptor load with the IE page address computation.
    write32le(loc, insn(PCALAU12I, R_A0, 0, 0)); // pcalau12i $a0, %ie_pc_hi20
    relocateNoSym(loc, R_LARCH_TLS_IE_PC_HI20, val);
  } else if (type == R_LARCH_TLS_DESC_CALL) {
    // Replace the descriptor call with a load of the TP offset from the GOT.
    write32le(loc, insn(ctx.arg.is64 ? LD_D : LD_W, R_A0, R_A0,
                        0)); // ld.[wd] $a0, $a0, %ie_pc_lo12
    relocateNoSym(loc, R_LARCH_TLS_IE_PC_LO12, val);
  } else {
    llvm_unreachable("unsupported relocation for TLSDESC to IE");
  }
}

// Convert TLSDESC GD/LD to LE.
// The code sequence obtained in the normal or medium code model is as follows:
// * lu12i.w $a0, %le_hi20(sym_le) # le_hi20 != 0
// * ori $a0 $a0, %le_lo12(sym_le)
// See the comment in tlsdescToIe for detailed information.
void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  assert(isInt<32>(val) &&
         "val exceeds the range of medium code model in tlsdescToLe");

  // When the TP offset fits in 12 bits, a single ori from $zero suffices and
  // the lu12i.w slot becomes a NOP.
  const bool fitsLo12 = isUInt<12>(val);
  switch (rel.type) {
  case R_LARCH_TLS_DESC_PC_HI20:
  case R_LARCH_TLS_DESC_PC_LO12:
  case R_LARCH_TLS_DESC_PCREL20_S2:
    // The DESC address-materializing instructions are unused in the LE form.
    write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop
    break;
  case R_LARCH_TLS_DESC_LD:
    // Emit the high 20 bits of the offset, or a NOP for small offsets.
    write32le(loc, fitsLo12
                       ? insn(ANDI, R_ZERO, R_ZERO, 0) // nop
                       : insn(LU12I_W, R_A0, extractBits(val, 31, 12),
                              0)); // lu12i.w $a0, %le_hi20
    break;
  case R_LARCH_TLS_DESC_CALL:
    // Materialize the low 12 bits; the source is $zero when the full offset
    // already fits in 12 bits (the lu12i.w slot above was NOPed out).
    write32le(loc, fitsLo12
                       ? insn(ORI, R_A0, R_ZERO, val) // ori $a0, $r0, %le_lo12
                       : insn(ORI, R_A0, R_A0,
                              lo12(val))); // ori $a0, $a0, %le_lo12
    break;
  default:
    llvm_unreachable("unsupported relocation for TLSDESC to LE");
  }
}

// Try GOT indirection to PC relative optimization when relaxation is enabled.
// From:
// * pcalau12i $a0, %got_pc_hi20(sym_got)
// * ld.w/d $a0, $a0, %got_pc_lo12(sym_got)
// To:
// * pcalau12i $a0, %pc_hi20(sym)
// * addi.w/d $a0, $a0, %pc_lo12(sym)
//
// Note: Although the optimization has been performed, the GOT entries still
// exist, similarly to AArch64. Eliminating the entries would increase code
// complexity.
bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
                              const Relocation &rLo12, uint64_t secAddr) const {
  // Only a non-preemptible, non-ifunc defined symbol can be addressed
  // directly; under PIC an absolute (section-less) symbol must keep its GOT
  // indirection.
  if (!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
      rHi20.sym->isGnuIFunc() ||
      (ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
    return false;

  Symbol &sym = *rHi20.sym;
  uint64_t symLocal = sym.getVA(ctx) + rHi20.addend;
  // Check if the address difference is within +/-2GB range.
  // For simplicity, the range mentioned here is an approximate estimate and is
  // not fully equivalent to the entire region that PC-relative addressing can
  // cover.
  int64_t pageOffset =
      getLoongArchPage(symLocal) - getLoongArchPage(secAddr + rHi20.offset);
  if (!isInt<20>(pageOffset >> 12))
    return false;

  // Retarget both relocations from the GOT entry to the symbol itself.
  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset,
                         rHi20.addend, &sym};
  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
                         &sym};

  const uint32_t currInsn = read32le(loc);
  const uint32_t nextInsn = read32le(loc + 4);
  // The pcalau12i destination must be both the base and the destination of
  // the following load; otherwise the pair cannot be rewritten.
  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
    return false;

  uint64_t pageDelta =
      getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
  // pcalau12i $a0, %pc_hi20
  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
  relocate(loc, newRHi20, pageDelta);
  // addi.w/d $a0, $a0, %pc_lo12
  write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn),
                          getJ5(nextInsn), 0));
  relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64));
  return true;
}

// During TLSDESC GD_TO_IE, the converted code sequence always includes an
// instruction related to the Lo12 relocation (ld.[wd]). To obtain correct val
// in `getRelocTargetVA`, expr of this instruction should be adjusted to
// R_RELAX_TLS_GD_TO_IE_ABS, while expr of other instructions related to the
// Hi20 relocation (pcalau12i) should be adjusted to
// RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC. Specifically, in the normal or
// medium code model, the instruction with relocation R_LARCH_TLS_DESC_CALL is
// the candidate of Lo12 relocation.
RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
  // Only the generic GD-to-IE expression needs adjustment; everything else
  // passes through unchanged.
  if (expr != R_RELAX_TLS_GD_TO_IE)
    return expr;
  // R_LARCH_TLS_DESC_CALL marks the instruction that becomes the Lo12 load
  // (ld.[wd]); all other DESC relocations map to the page-PC expression.
  return type == R_LARCH_TLS_DESC_CALL
             ? R_RELAX_TLS_GD_TO_IE_ABS
             : RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC;
}

void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
const unsigned bits = ctx.arg.is64 ? 64 : 32;
uint64_t secAddr = sec.getOutputSection()->addr;
Expand Down Expand Up @@ -1088,6 +1264,72 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
tlsIeToLe(loc, rel, val);
}
continue;
case RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC:
if (rel.type == R_LARCH_TLS_DESC_PC_HI20) {
// LoongArch does not support TLSDESC GD/LD to LE/IE optimization in the
// extreme code model. In these cases, the relocs are as follows:
//
// * i -- R_LARCH_TLS_DESC_PC_HI20
// * i+1 -- R_LARCH_TLS_DESC_PC_LO12
// * i+2 -- R_LARCH_TLS_DESC64_PC_LO20
// * i+3 -- R_LARCH_TLS_DESC64_PC_HI12
isExtreme =
(i + 2 < size && relocs[i + 2].type == R_LARCH_TLS_DESC64_PC_LO20);
}
[[fallthrough]];
case R_RELAX_TLS_GD_TO_IE_ABS:
if (isExtreme) {
if (rel.type == R_LARCH_TLS_DESC_CALL)
continue;
rel.expr = getRelExpr(rel.type, *rel.sym, loc);
val = SignExtend64(sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset),
bits);
relocateNoSym(loc, rel.type, val);
} else {
isRelax = relaxable(relocs, i);
if (isRelax && (rel.type == R_LARCH_TLS_DESC_PC_HI20 ||
rel.type == R_LARCH_TLS_DESC_PC_LO12))
continue;
tlsdescToIe(loc, rel, val);
}
continue;
case R_RELAX_TLS_GD_TO_LE:
if (rel.type == R_LARCH_TLS_DESC_PC_HI20) {
isExtreme =
(i + 2 < size && relocs[i + 2].type == R_LARCH_TLS_DESC64_PC_LO20);
}
if (isExtreme) {
if (rel.type == R_LARCH_TLS_DESC_CALL)
continue;
rel.expr = getRelExpr(rel.type, *rel.sym, loc);
val = SignExtend64(sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset),
bits);
relocateNoSym(loc, rel.type, val);
} else {
isRelax = relaxable(relocs, i);
if (isRelax && (rel.type == R_LARCH_TLS_DESC_PC_HI20 ||
rel.type == R_LARCH_TLS_DESC_PC_LO12 ||
(rel.type == R_LARCH_TLS_DESC_LD && isUInt<12>(val))))
continue;
tlsdescToLe(loc, rel, val);
}
continue;
case RE_LOONGARCH_GOT_PAGE_PC:
// In LoongArch, we try GOT indirection to PC relative optimization only
// when relaxation is enabled. This approach avoids determining whether
// relocation types are paired and whether the destination register of
// pcalau12i is only used by the immediately following instruction.
// Moreover, if the original code sequence can be relaxed to a single
// instruction `pcaddi`, the first instruction will be removed and it will
// not reach here.
if (isPairRelaxable(relocs, i) && rel.type == R_LARCH_GOT_PC_HI20 &&
relocs[i + 2].type == R_LARCH_GOT_PC_LO12 &&
tryGotToPCRel(loc, rel, relocs[i + 2], secAddr)) {
i = i + 3; // skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12,
// R_LARCH_RELAX
continue;
}
break;
default:
break;
}
Expand Down
1 change: 1 addition & 0 deletions lld/ELF/InputSection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r,
case R_GOTPLT_PC:
return r.sym->getGotPltVA(ctx) + a - p;
case RE_LOONGARCH_GOT_PAGE_PC:
case RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC:
if (r.sym->hasFlag(NEEDS_TLSGD))
return getLoongArchPageDelta(ctx.in.got->getGlobalDynAddr(*r.sym) + a, p,
r.type);
Expand Down
50 changes: 33 additions & 17 deletions lld/ELF/Relocations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1346,22 +1346,10 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type,
if (ctx.arg.emachine == EM_MIPS)
return handleMipsTlsRelocation(ctx, type, sym, *sec, offset, addend, expr);

// LoongArch does not yet implement transition from TLSDESC to LE/IE, so
// generate TLSDESC dynamic relocation for the dynamic linker to handle.
if (ctx.arg.emachine == EM_LOONGARCH &&
oneof<RE_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_PC,
R_TLSDESC_CALL>(expr)) {
if (expr != R_TLSDESC_CALL) {
sym.setFlags(NEEDS_TLSDESC);
sec->addReloc({expr, type, offset, addend, &sym});
}
return 1;
}

bool isRISCV = ctx.arg.emachine == EM_RISCV;

if (oneof<RE_AARCH64_TLSDESC_PAGE, R_TLSDESC, R_TLSDESC_CALL, R_TLSDESC_PC,
R_TLSDESC_GOTPLT>(expr) &&
R_TLSDESC_GOTPLT, RE_LOONGARCH_TLSDESC_PAGE_PC>(expr) &&
ctx.arg.shared) {
// R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a label. Do not
// set NEEDS_TLSDESC on the label.
Expand All @@ -1375,10 +1363,14 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type,
return 1;
}

// LoongArch supports IE to LE optimization in non-extreme code model.
// LoongArch supports IE to LE, DESC GD/LD to IE/LE optimizations in
// non-extreme code model.
bool execOptimizeInLoongArch =
ctx.arg.emachine == EM_LOONGARCH &&
(type == R_LARCH_TLS_IE_PC_HI20 || type == R_LARCH_TLS_IE_PC_LO12);
(type == R_LARCH_TLS_IE_PC_HI20 || type == R_LARCH_TLS_IE_PC_LO12 ||
type == R_LARCH_TLS_DESC_PC_HI20 || type == R_LARCH_TLS_DESC_PC_LO12 ||
type == R_LARCH_TLS_DESC_LD || type == R_LARCH_TLS_DESC_CALL ||
type == R_LARCH_TLS_DESC_PCREL20_S2);

// ARM, Hexagon, LoongArch and RISC-V do not support GD/LD to IE/LE
// optimizations.
Expand Down Expand Up @@ -1437,9 +1429,23 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type,
return 1;
}

// LoongArch does not support transition from TLSDESC to LE/IE in the extreme
// code model, in which NEEDS_TLSDESC should set, rather than NEEDS_TLSGD. So
// we check independently.
if (ctx.arg.emachine == EM_LOONGARCH &&
oneof<RE_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_PC,
R_TLSDESC_CALL>(expr) &&
!execOptimize) {
if (expr != R_TLSDESC_CALL) {
sym.setFlags(NEEDS_TLSDESC);
sec->addReloc({expr, type, offset, addend, &sym});
}
return 1;
}

if (oneof<RE_AARCH64_TLSDESC_PAGE, R_TLSDESC, R_TLSDESC_CALL, R_TLSDESC_PC,
R_TLSDESC_GOTPLT, R_TLSGD_GOT, R_TLSGD_GOTPLT, R_TLSGD_PC,
RE_LOONGARCH_TLSGD_PAGE_PC>(expr)) {
RE_LOONGARCH_TLSGD_PAGE_PC, RE_LOONGARCH_TLSDESC_PAGE_PC>(expr)) {
if (!execOptimize) {
sym.setFlags(NEEDS_TLSGD);
sec->addReloc({expr, type, offset, addend, &sym});
Expand All @@ -1453,7 +1459,17 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type,
// label, so TLSDESC=>IE will be categorized as R_RELAX_TLS_GD_TO_LE. We fix
// the categorization in RISCV::relocateAllosec->
if (sym.isPreemptible) {
sym.setFlags(NEEDS_TLSGD_TO_IE);
// In LoongArch, TLSDESC code sequences share relocations
// R_LARCH_TLS_DESC_PC_HI20 and R_LARCH_TLS_DESC_PC_LO12 in
// normal/medium/extreme code model. Since the extreme code model cannot
// be optimized to IE/LE, the flag NEEDS_TLSGD_TO_IE added previously
// needs to be cleared.
// In extreme code model, R_LARCH_TLS_DESC64_LO20 and
// R_LARCH_TLS_DESC64_HI12 will set NEEDS_TLSDESC flag.
if (ctx.arg.emachine == EM_LOONGARCH && sym.hasFlag(NEEDS_TLSDESC))
sym.clearFlags(NEEDS_TLSGD_TO_IE);
else
sym.setFlags(NEEDS_TLSGD_TO_IE);
sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE),
type, offset, addend, &sym});
} else {
Expand Down
1 change: 1 addition & 0 deletions lld/ELF/Relocations.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ enum RelExpr {
RE_LOONGARCH_GOT_PAGE_PC,
RE_LOONGARCH_TLSGD_PAGE_PC,
RE_LOONGARCH_TLSDESC_PAGE_PC,
RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC,
};

// Architecture-neutral representation of relocation.
Expand Down
Loading