[PATCH] LoongArch: Batch-delete bytes at the end of each relax trip
WANG Xuerui
i.swmail@xen0n.name
Fri Jun 13 15:15:14 GMT 2025
More information about the Binutils mailing list
Fri Jun 13 15:15:14 GMT 2025
- Previous message (by thread): Inclusion of Custom NOLOAD Section in pei-x86-64 Output Binary
- Next message (by thread): [PATCH] LoongArch: Batch-delete bytes at the end of each relax trip
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
From: WANG Xuerui <git@xen0n.name> Previously, memmove and reloc/symbol adjustments happened at each loongarch_relax_delete_bytes() call, which is O(n^2) time complexity and leads to unacceptable (multiple hours) linking times for certain inputs with huge number of relaxable sites -- see the linked issue for details. To get rid of the quadratic behavior, defer all delete ops to the end of each relax trip, with the buffer implemented with the splay tree from libiberty. The individual relaxation handlers are converted to handle symbol values and relocation offsets as if all preceding deletions actually happened; because accesses during a relaxation trip are mostly sequential accesses, the splay tree should be efficient. The relaxation behavior should remain unchanged and has been confirmed with manually inserted debug logs. Example running times before and after the change with the test case in the linked issue (mypy transpiled C), cross-linking on Threadripper 3990X: Before: 4192.80s user 1.09s system 98% cpu 1:10:53.52 total After: 1.76s user 0.74s system 98% cpu 2.539 total - ~1/2382 the time! Also tested with binutils (bootstrapping self), CPython 3.14 and LLVM 20.1.6; all passed the respective test suites. Link: https://github.com/loongson-community/discussions/issues/56 Signed-off-by: WANG Xuerui <git@xen0n.name> --- bfd/elfnn-loongarch.c | 316 +++++++++++++++++++++++++++++++++++------- 1 file changed, 266 insertions(+), 50 deletions(-) diff --git a/bfd/elfnn-loongarch.c b/bfd/elfnn-loongarch.c index 5c504f2cad8..15f10d678ed 100644 --- a/bfd/elfnn-loongarch.c +++ b/bfd/elfnn-loongarch.c @@ -25,6 +25,7 @@ #define ARCH_SIZE NN #include "elf-bfd.h" #include "objalloc.h" +#include "splay-tree.h" #include "elf/loongarch.h" #include "elfxx-loongarch.h" #include "opcode/loongarch.h" @@ -134,6 +135,10 @@ struct loongarch_elf_link_hash_table a partially updated state (some sections have vma updated but the others do not), and it's unsafe to do the normal relaxation. */ bool layout_mutating_for_relr; + + /* Pending relaxation (byte deletion) operations meant for roughly + sequential access. */ + splay_tree pending_delete_ops; }; struct loongarch_elf_section_data @@ -4724,12 +4729,128 @@ loongarch_elf_relocate_section (bfd *output_bfd, struct bfd_link_info *info, return !fatal; } -static bool +/* A pending delete op during a linker relaxation trip, to be stored in a + splay tree. + The key is the starting offset of this op's deletion range, interpreted + as if no delete op is executed for this trip. */ +struct pending_delete_op +{ + /* Number of bytes to delete at the address. */ + bfd_size_type size; + + /* The total offset adjustment at the address as if all preceding delete + ops had been executed. Used to calculate expected addresses after + relaxation without actually adjusting anything. */ + bfd_size_type cumulative_offset; +}; + +static int +pending_delete_op_compare (splay_tree_key a, splay_tree_key b) +{ + bfd_vma off_a = (bfd_vma)a; + bfd_vma off_b = (bfd_vma)b; + + if (off_a < off_b) + return -1; + else if (off_a > off_b) + return 1; + else + return 0; +} + +static splay_tree +pending_delete_ops_new (void) +{ + /* The node values are allocated with bfd_zalloc, so they are automatically + taken care of at BFD release time. */ + return splay_tree_new (pending_delete_op_compare, NULL, NULL); +} + +static bfd_size_type +loongarch_calc_relaxed_addr (struct bfd_link_info *info, bfd_vma offset) +{ + struct loongarch_elf_link_hash_table *htab = loongarch_elf_hash_table (info); + splay_tree pending_delete_ops = htab->pending_delete_ops; + struct pending_delete_op *op; + splay_tree_node node; + + BFD_ASSERT (pending_delete_ops != NULL); + + /* Find the op that starts just before the given address. */ + node = splay_tree_predecessor (pending_delete_ops, (splay_tree_key)offset); + if (node == NULL) + /* Nothing has been deleted yet. */ + return offset; + BFD_ASSERT (((bfd_vma)node->key) < offset); + op = (struct pending_delete_op *)node->value; + + /* If offset is inside this op's range, it is actually one of the deleted + bytes, so the adjusted node->key should be returned in this case. */ + bfd_vma op_end_off = (bfd_vma)node->key + op->size; + if (offset < op_end_off) + { + offset = (bfd_vma)node->key; + node = splay_tree_predecessor (pending_delete_ops, node->key); + op = node ? (struct pending_delete_op *)node->value : NULL; + } + + return offset - (op ? op->cumulative_offset : 0); +} + +static void loongarch_relax_delete_bytes (bfd *abfd, - asection *sec, bfd_vma addr, size_t count, struct bfd_link_info *link_info) +{ + struct loongarch_elf_link_hash_table *htab = + loongarch_elf_hash_table (link_info); + splay_tree pending_delete_ops = htab->pending_delete_ops; + splay_tree_node node; + struct pending_delete_op *op = NULL, *new_op = NULL; + bool need_new_node = true; + + if (count == 0) + return; + + BFD_ASSERT (pending_delete_ops != NULL); + + node = splay_tree_predecessor (pending_delete_ops, addr); + if (node) + { + op = (struct pending_delete_op *)node->value; + if ((bfd_vma)node->key + op->size >= addr) + { + /* The previous op already covers this offset, coalesce the new op + into it. */ + op->size += count; + op->cumulative_offset += count; + need_new_node = false; + } + } + + if (need_new_node) + { + new_op = bfd_zalloc (abfd, sizeof (struct pending_delete_op)); + new_op->size = count; + new_op->cumulative_offset = (op ? op->cumulative_offset : 0) + count; + node = splay_tree_insert (pending_delete_ops, (splay_tree_key)addr, + (splay_tree_value)new_op); + } + + /* Adjust all cumulative offsets after this op. */ + for (node = splay_tree_successor (pending_delete_ops, node->key); + node != NULL; + node = splay_tree_successor (pending_delete_ops, node->key)) + { + op = (struct pending_delete_op *)node->value; + op->cumulative_offset += count; + } +} + +static bool +loongarch_relax_perform_deletes (bfd *abfd, asection *sec, + struct bfd_link_info *link_info) { unsigned int i, symcount; bfd_vma toaddr = sec->size; @@ -4737,30 +4858,73 @@ loongarch_relax_delete_bytes (bfd *abfd, Elf_Internal_Shdr *symtab_hdr = &elf_tdata (abfd)->symtab_hdr; unsigned int sec_shndx = _bfd_elf_section_from_bfd_section (abfd, sec); struct bfd_elf_section_data *data = elf_section_data (sec); - bfd_byte *contents = data->this_hdr.contents; + bfd_byte *contents = data->this_hdr.contents, *contents_end = NULL; struct relr_entry *relr = loongarch_elf_section_data (sec)->relr; struct loongarch_elf_link_hash_table *htab = loongarch_elf_hash_table (link_info); struct relr_entry *relr_end = NULL; + splay_tree pending_delete_ops = htab->pending_delete_ops; + splay_tree_node delete_op_node = NULL, next = NULL; if (htab->relr_count) relr_end = htab->relr + htab->relr_count; - /* Actually delete the bytes. */ - sec->size -= count; - memmove (contents + addr, contents + addr + count, toaddr - addr - count); + BFD_ASSERT (pending_delete_ops != NULL); + delete_op_node = splay_tree_min (pending_delete_ops); + + if (delete_op_node == NULL) + /* No pending delete ops, nothing to do. */ + return true; + + /* Actually delete the bytes. For each delete op the pointer arithmetics look + like this: + + delete_op_node->key -+ +- next->key + |<-- op1->size -->| | + v v v + ...------------------------xxxxxxxxxxxxxxxxxxx---------xxxxxxxxxxx-----... + ^ + delete_op_node->key + op1->size -+ + + For memmove, we need to translate offsets to pointers by adding them to + `contents`. */ + for (; delete_op_node; delete_op_node = next) + { + struct pending_delete_op *op1 + = (struct pending_delete_op *)delete_op_node->value; + bfd_vma op1_start_off = (bfd_vma)delete_op_node->key; + bfd_vma op1_end_off = op1_start_off + op1->size; + next = splay_tree_successor (pending_delete_ops, delete_op_node->key); + bfd_vma op2_start_off = next ? (bfd_vma)next->key : toaddr; + bfd_size_type count = op2_start_off - op1_end_off; + + if (count) + { + if (contents_end == NULL) + /* Start from the end of the first unmodified content chunk. */ + contents_end = contents + op1_start_off; + + memmove (contents_end, contents + op1_end_off, count); + contents_end += count; + } + + /* Adjust the section size once, when we have reached the end. */ + if (next == NULL) + sec->size -= op1->cumulative_offset; + } /* Adjust the location of all of the relocs. Note that we need not adjust the addends, since all PC-relative references must be against symbols, which we will adjust below. */ for (i = 0; i < sec->reloc_count; i++) - if (data->relocs[i].r_offset > addr && data->relocs[i].r_offset < toaddr) - data->relocs[i].r_offset -= count; + if (data->relocs[i].r_offset < toaddr) + data->relocs[i].r_offset = loongarch_calc_relaxed_addr ( + link_info, data->relocs[i].r_offset); /* Likewise for relative relocs to be packed into .relr. */ for (; relr && relr < relr_end && relr->sec == sec; relr++) - if (relr->off > addr && relr->off < toaddr) - relr->off -= count; + if (relr->off < toaddr) + relr->off = loongarch_calc_relaxed_addr (link_info, relr->off); /* Adjust the local symbols defined in this section. */ for (i = 0; i < symtab_hdr->sh_info; i++) @@ -4768,24 +4932,33 @@ loongarch_relax_delete_bytes (bfd *abfd, Elf_Internal_Sym *sym = (Elf_Internal_Sym *) symtab_hdr->contents + i; if (sym->st_shndx == sec_shndx) { - /* If the symbol is in the range of memory we just moved, we - have to adjust its value. */ - if (sym->st_value > addr && sym->st_value <= toaddr) - sym->st_value -= count; + bfd_vma orig_value = sym->st_value; + if (orig_value <= toaddr) + sym->st_value + = loongarch_calc_relaxed_addr (link_info, orig_value); - /* If the symbol *spans* the bytes we just deleted (i.e. its - *end* is in the moved bytes but its *start* isn't), then we - must adjust its size. + /* If the symbol *spans* some deleted bytes (i.e. its *end* is in + the moved bytes but its *start* isn't), then we must adjust its + size. This test needs to use the original value of st_value, otherwise we might accidentally decrease size when deleting bytes right - before the symbol. But since deleted relocs can't span across - symbols, we can't have both a st_value and a st_size decrease, - so it is simpler to just use an else. */ - else if (sym->st_value <= addr - && sym->st_value + sym->st_size > addr - && sym->st_value + sym->st_size <= toaddr) - sym->st_size -= count; + before the symbol. */ + bfd_vma sym_end = orig_value + sym->st_size; + if (sym_end <= toaddr) + { + splay_tree_node node = splay_tree_predecessor ( + pending_delete_ops, (splay_tree_key)sym_end); + if (node) + { + bfd_vma addr = (bfd_vma)node->key; + struct pending_delete_op *op + = (struct pending_delete_op *)node->value; + + if (orig_value <= addr && sym_end > addr) + sym->st_size -= op->size; + } + } } } @@ -4830,16 +5003,29 @@ loongarch_relax_delete_bytes (bfd *abfd, || sym_hash->root.type == bfd_link_hash_defweak) && sym_hash->root.u.def.section == sec) { - /* As above, adjust the value if needed. */ - if (sym_hash->root.u.def.value > addr - && sym_hash->root.u.def.value <= toaddr) - sym_hash->root.u.def.value -= count; + bfd_vma orig_value = sym_hash->root.u.def.value; + + /* As above, adjust the value. */ + if (orig_value <= toaddr) + sym_hash->root.u.def.value + = loongarch_calc_relaxed_addr (link_info, orig_value); /* As above, adjust the size if needed. */ - else if (sym_hash->root.u.def.value <= addr - && sym_hash->root.u.def.value + sym_hash->size > addr - && sym_hash->root.u.def.value + sym_hash->size <= toaddr) - sym_hash->size -= count; + bfd_vma sym_end = orig_value + sym_hash->size; + if (sym_end <= toaddr) + { + splay_tree_node node = splay_tree_predecessor ( + pending_delete_ops, (splay_tree_key)sym_end); + if (node) + { + bfd_vma addr = (bfd_vma)node->key; + struct pending_delete_op *op + = (struct pending_delete_op *)node->value; + + if (orig_value <= addr && sym_end > addr) + sym_hash->size -= op->size; + } + } } } @@ -4919,7 +5105,7 @@ loongarch_tls_perform_trans (bfd *abfd, asection *sec, bfd_put (32, abfd, LARCH_NOP, contents + rel->r_offset); /* link with -relax option will delete NOP. */ if (!info->disable_target_specific_optimizations) - loongarch_relax_delete_bytes (abfd, sec, rel->r_offset, 4, info); + loongarch_relax_delete_bytes (abfd, rel->r_offset, 4, info); return true; case R_LARCH_TLS_IE_PC_HI20: @@ -5008,8 +5194,7 @@ loongarch_tls_perform_trans (bfd *abfd, asection *sec, lu52i.d $rd,$rd,%le64_hi12(sym) => (deleted) */ static bool -loongarch_relax_tls_le (bfd *abfd, asection *sec, - asection *sym_sec ATTRIBUTE_UNUSED, +loongarch_relax_tls_le (bfd *abfd, asection *sec, asection *sym_sec, Elf_Internal_Rela *rel, bfd_vma symval, struct bfd_link_info *link_info, bool *agin ATTRIBUTE_UNUSED, @@ -5019,6 +5204,8 @@ loongarch_relax_tls_le (bfd *abfd, asection *sec, uint32_t insn = bfd_get (32, abfd, contents + rel->r_offset); static uint32_t insn_rj,insn_rd; symval = symval - elf_hash_table (link_info)->tls_sec->vma; + if (sym_sec == sec) + symval = loongarch_calc_relaxed_addr(link_info, symval); /* The old LE instruction sequence can be relaxed when the symbol offset is smaller than the 12-bit range. */ if (symval <= 0xfff) @@ -5033,7 +5220,7 @@ loongarch_relax_tls_le (bfd *abfd, asection *sec, if (symval < 0x800) { rel->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, sec, rel->r_offset, + loongarch_relax_delete_bytes (abfd, rel->r_offset, 4, link_info); } break; @@ -5058,7 +5245,7 @@ loongarch_relax_tls_le (bfd *abfd, asection *sec, case R_LARCH_TLS_LE64_LO20: case R_LARCH_TLS_LE64_HI12: rel->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, sec, rel->r_offset, + loongarch_relax_delete_bytes (abfd, rel->r_offset, 4, link_info); break; @@ -5116,7 +5303,11 @@ loongarch_relax_pcala_addi (bfd *abfd, asection *sec, asection *sym_sec, size_input_section already took care of updating it after relaxation, so we additionally update once here. */ sec->output_offset = sec->output_section->size; - bfd_vma pc = sec_addr (sec) + rel_hi->r_offset; + bfd_vma pc = sec_addr (sec) + + loongarch_calc_relaxed_addr (info, rel_hi->r_offset); + if (sym_sec == sec) + symval = sec_addr (sec) + + loongarch_calc_relaxed_addr(info, symval - sec_addr (sec)); /* If pc and symbol not in the same segment, add/sub segment alignment. */ if (!loongarch_two_sections_in_same_segment (info->output_bfd, @@ -5155,7 +5346,7 @@ loongarch_relax_pcala_addi (bfd *abfd, asection *sec, asection *sym_sec, R_LARCH_PCREL20_S2); rel_lo->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, sec, rel_lo->r_offset, 4, info); + loongarch_relax_delete_bytes (abfd, rel_lo->r_offset, 4, info); return true; } @@ -5177,7 +5368,11 @@ loongarch_relax_call36 (bfd *abfd, asection *sec, asection *sym_sec, size_input_section already took care of updating it after relaxation, so we additionally update once here. */ sec->output_offset = sec->output_section->size; - bfd_vma pc = sec_addr (sec) + rel->r_offset; + bfd_vma pc = sec_addr (sec) + + loongarch_calc_relaxed_addr (info, rel->r_offset); + if (sym_sec == sec) + symval = sec_addr (sec) + + loongarch_calc_relaxed_addr(info, symval - sec_addr (sec)); /* If pc and symbol not in the same segment, add/sub segment alignment. */ if (!loongarch_two_sections_in_same_segment (info->output_bfd, @@ -5211,7 +5406,7 @@ loongarch_relax_call36 (bfd *abfd, asection *sec, asection *sym_sec, /* Adjust relocations. */ rel->r_info = ELFNN_R_INFO (ELFNN_R_SYM (rel->r_info), R_LARCH_B26); /* Delete jirl instruction. */ - loongarch_relax_delete_bytes (abfd, sec, rel->r_offset + 4, 4, info); + loongarch_relax_delete_bytes (abfd, rel->r_offset + 4, 4, info); return true; } @@ -5237,7 +5432,11 @@ loongarch_relax_pcala_ld (bfd *abfd, asection *sec, size_input_section already took care of updating it after relaxation, so we additionally update once here. */ sec->output_offset = sec->output_section->size; - bfd_vma pc = sec_addr (sec) + rel_hi->r_offset; + bfd_vma pc = sec_addr (sec) + + loongarch_calc_relaxed_addr(info, rel_hi->r_offset); + if (sym_sec == sec) + symval = sec_addr (sec) + + loongarch_calc_relaxed_addr(info, symval - sec_addr (sec)); /* If pc and symbol not in the same segment, add/sub segment alignment. */ if (!loongarch_two_sections_in_same_segment (info->output_bfd, @@ -5287,7 +5486,7 @@ bfd_elfNN_loongarch_set_data_segment_info (struct bfd_link_info *info, static bool loongarch_relax_align (bfd *abfd, asection *sec, asection *sym_sec, Elf_Internal_Rela *rel, - bfd_vma symval ATTRIBUTE_UNUSED, + bfd_vma symval, struct bfd_link_info *link_info, bool *again ATTRIBUTE_UNUSED, bfd_vma max_alignment ATTRIBUTE_UNUSED) @@ -5303,6 +5502,10 @@ loongarch_relax_align (bfd *abfd, asection *sec, asection *sym_sec, else alignment = rel->r_addend + 4; + if (sym_sec == sec) + symval = sec_addr (sec) + + loongarch_calc_relaxed_addr (link_info, symval - sec_addr (sec)); + addend = alignment - 4; /* The bytes of NOPs added by R_LARCH_ALIGN. */ symval -= addend; /* The address of first NOP added by R_LARCH_ALIGN. */ bfd_vma aligned_addr = ((symval - 1) & ~(alignment - 1)) + alignment; @@ -5328,17 +5531,19 @@ loongarch_relax_align (bfd *abfd, asection *sec, asection *sym_sec, /* If skipping more bytes than the specified maximum, then the alignment is not done at all and delete all NOPs. */ if (max > 0 && need_nop_bytes > max) - return loongarch_relax_delete_bytes (abfd, sec, rel->r_offset, - addend, link_info); + { + loongarch_relax_delete_bytes (abfd, rel->r_offset, addend, link_info); + return true; + } /* If the number of NOPs is already correct, there's nothing to do. */ if (need_nop_bytes == addend) return true; /* Delete the excess NOPs. */ - return loongarch_relax_delete_bytes (abfd, sec, - rel->r_offset + need_nop_bytes, - addend - need_nop_bytes, link_info); + loongarch_relax_delete_bytes (abfd, rel->r_offset + need_nop_bytes, + addend - need_nop_bytes, link_info); + return true; } /* Relax pcalau12i + addi.d of TLS LD/GD/DESC to pcaddi. */ @@ -5359,7 +5564,11 @@ loongarch_relax_tls_ld_gd_desc (bfd *abfd, asection *sec, asection *sym_sec, size_input_section already took care of updating it after relaxation, so we additionally update once here. */ sec->output_offset = sec->output_section->size; - bfd_vma pc = sec_addr (sec) + rel_hi->r_offset; + bfd_vma pc = sec_addr (sec) + + loongarch_calc_relaxed_addr (info, rel_hi->r_offset); + if (sym_sec == sec) + symval = sec_addr (sec) + + loongarch_calc_relaxed_addr (info, symval - sec_addr (sec)); /* If pc and symbol not in the same segment, add/sub segment alignment. */ if (!loongarch_two_sections_in_same_segment (info->output_bfd, @@ -5414,7 +5623,7 @@ loongarch_relax_tls_ld_gd_desc (bfd *abfd, asection *sec, asection *sym_sec, } rel_lo->r_info = ELFNN_R_INFO (0, R_LARCH_NONE); - loongarch_relax_delete_bytes (abfd, sec, rel_lo->r_offset, 4, info); + loongarch_relax_delete_bytes (abfd, rel_lo->r_offset, 4, info); return true; } @@ -5502,6 +5711,9 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, htab->max_alignment = max_alignment; } + splay_tree pending_delete_ops = pending_delete_ops_new (); + htab->pending_delete_ops = pending_delete_ops; + for (unsigned int i = 0; i < sec->reloc_count; i++) { char symtype; @@ -5737,6 +5949,10 @@ loongarch_elf_relax_section (bfd *abfd, asection *sec, info, again, max_alignment); } + loongarch_relax_perform_deletes (abfd, sec, info); + htab->pending_delete_ops = NULL; + splay_tree_delete (pending_delete_ops); + return true; } -- 2.48.1
- Previous message (by thread): Inclusion of Custom NOLOAD Section in pei-x86-64 Output Binary
- Next message (by thread): [PATCH] LoongArch: Batch-delete bytes at the end of each relax trip
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the Binutils mailing list