Buildroot Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync
@ 2023-11-27 22:41 Brandon Maier via buildroot
  2023-11-28 15:49 ` Herve Codina via buildroot
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Brandon Maier via buildroot @ 2023-11-27 22:41 UTC (permalink / raw)
  To: buildroot; +Cc: Herve Codina, Yann E . MORIN, Brandon Maier

The per-package-rsync stage can add a significant amount of time to
builds. They can also be annoying as the slowest rsyncs, the final
target-finalize and host-finalize stages, run on `make all` which we use
frequently during development.

The per-package-rsync is slow because it launches a new rsync for each
source tree, and each rsync must rescan the destination directory and
potentially overwrite files multiple times. So instead we manually walk
the source trees to find only the files that should be written to the
destination, then feed that to a single instance of rsync.

Below is a benchmark running just the host-finalize merge for a system
with 200 host packages, running in both hardlink and full copy mode.

Benchmark 1: ppd-merge-copy
  Time (mean ± σ):      5.332 s ±  0.098 s    [User: 5.300 s, System: 3.926 s]
  Range (min … max):    5.099 s …  5.468 s    10 runs

Benchmark 2: ppd-merge-hardlink
  Time (mean ± σ):      2.067 s ±  0.027 s    [User: 1.218 s, System: 1.614 s]
  Range (min … max):    2.037 s …  2.114 s    10 runs

Benchmark 3: rsync-copy
  Time (mean ± σ):     25.539 s ±  0.233 s    [User: 11.705 s, System: 10.862 s]
  Range (min … max):   25.215 s … 25.966 s    10 runs

Benchmark 4: rsync-hardlink
  Time (mean ± σ):     24.074 s ±  0.271 s    [User: 3.451 s, System: 10.802 s]
  Range (min … max):   23.672 s … 24.522 s    10 runs

Summary
  'ppd-merge-hardlink' ran
    2.58 ± 0.06 times faster than 'ppd-merge-copy'
   11.65 ± 0.20 times faster than 'rsync-hardlink'
   12.36 ± 0.20 times faster than 'rsync-copy'

Signed-off-by: Brandon Maier <brandon.maier@collins.com>
---
 package/pkg-utils.mk         |  16 +---
 support/scripts/ppd-merge.sh | 154 +++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 13 deletions(-)
 create mode 100755 support/scripts/ppd-merge.sh

diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
index 059e86ae0a..3968cabebd 100644
--- a/package/pkg-utils.mk
+++ b/package/pkg-utils.mk
@@ -216,19 +216,9 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
 # $3: destination directory
 # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
 define per-package-rsync
-	mkdir -p $(3)
-	$(foreach pkg,$(1),\
-		rsync -a \
-			--hard-links \
-			$(if $(filter hardlink,$(4)), \
-				--link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
-				$(if $(filter copy,$(4)), \
-					$(empty), \
-					$(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
-				) \
-			) \
-			$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
-			$(3)$(sep))
+	PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
+		$(TOPDIR)/support/scripts/ppd-merge.sh \
+		$(2) $(3) $(4) $(1)
 endef
 
 # prepares the per-package HOST_DIR and TARGET_DIR of the current
diff --git a/support/scripts/ppd-merge.sh b/support/scripts/ppd-merge.sh
new file mode 100755
index 0000000000..eb1caf9e52
--- /dev/null
+++ b/support/scripts/ppd-merge.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Merge a set of Buildroot per-package-directories (PPD) into a single
+# destination directory.
+#
+# ppd-merge scans through all the PPD and builds a table of which files from
+# each source directory will be written to the destination directory. It feeds
+# that table into rsync to tell it exactly which files to copy.
+#
+# Example:
+#   ppd-merge replaces the original method of merging PPD that ran many rsync
+#   commands like below.
+#
+#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-1/host/ \
+#   >     $PER_PACKAGE_DIR/pkg-1/host/ dest/
+#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-2/host/ \
+#   >     $PER_PACKAGE_DIR/pkg-2/host/ dest/
+#   ...
+#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-n/host/ \
+#   >     $PER_PACKAGE_DIR/pkg-n/host/ dest/
+#
+#   The equivalent command with ppd-merge is
+#
+#   > PER_PACKAGE_DIR=$PER_PACKAGE_DIR \
+#   >     ppd-merge.sh host dest/ hardlink \
+#   >     pkg-1 pkg-2 ... pkg-n
+
+set -euo pipefail
+
+usage() {
+    cat <<EOF >&2
+Usage:
+  ppd-merge.sh <host|target> <destination> <hardlink|copy> <pkg-name>...
+
+  PER_PACKAGE_DIR must be defined in the environment.
+
+  Merges each directory for <pkg-name> at
+  "\$PER_PACKAGE_DIR/<pkg-name>/<host|target>/" into <destination>.
+EOF
+    echo "Error: $*" >&2
+    exit 1
+}
+
+search_subtrees() {
+    # Use `find` to walk all subtrees and print their contents in `rsync`
+    # '--relative' path syntax. Filter paths through awk so we discard any
+    # duplicate files found while walking subtrees.
+    (cd "$PER_PACKAGE_DIR" && find "${subtrees[@]}" -mindepth 1 -printf "%H/./%P\0") \
+        | awk '
+        BEGIN {
+            RS="\0";
+            ORS="\0";
+            FS="/\\./";
+        }
+        {
+            if (!($2 in found)) {
+                found[$2]=1;
+                print;
+            }
+        }'
+}
+
+rsync_hardlink() {
+    # rsync hardlinking with --link-dest hard caps at 20 subtrees. Batch up the
+    # input files in $tmpdir until we have 20 subtrees, then flush them with an
+    # rsync call.
+    tmpdir=$(mktemp -d)
+    awk -v tmpdir="$tmpdir" '
+        function mkfiles() {
+            out_file=tmpdir "/" i ".files";
+            subtrees_file=tmpdir "/" i ".subtrees";
+        }
+        function batch_out() {
+            close(out_file);
+            close(subtrees_file);
+            delete subtrees;
+            print i;
+            i+=1;
+            mkfiles();
+        }
+        BEGIN {
+            RS="\0";
+            ORS="\0";
+            FS="/\\./";
+            i=0;
+            mkfiles();
+        }
+        !($1 in subtrees) && (length(subtrees) >= 20) {
+            batch_out();
+        }
+        {
+            print > out_file;
+            if (!($1 in subtrees)) {
+                subtrees[$1]=1;
+                print $1 > subtrees_file;
+            }
+        }
+        END {
+            if (length(subtrees) > 0) {
+                batch_out();
+            }
+        }' | while IFS= read -r -d $'\0' i; do
+            rsync_cmd_batch=( "${rsync_cmd[@]}" )
+            while IFS= read -r -d $'\0' subtree; do
+                rsync_cmd_batch+=( --link-dest="$PER_PACKAGE_DIR/$subtree/" )
+            done <"$tmpdir/$i.subtrees"
+            "${rsync_cmd_batch[@]}" "$PER_PACKAGE_DIR/" "$dest/" <"$tmpdir/$i.files"
+        done
+    rm -rf "$tmpdir"
+}
+
+if [ "$#" -lt 3 ]; then
+    usage "Invalid number of arguments"
+fi
+type=$1
+dest=$2
+link=$3
+shift 3
+
+case "$type" in
+host|target)
+    ;;
+*)
+    usage "Unknown type '$type'"
+    ;;
+esac
+
+subtrees=()
+for ((i = $#; i > 0; i--)); do
+    subtrees+=( "${!i}/$type" )
+done
+
+if [ -z "${PER_PACKAGE_DIR:-}" ]; then
+    usage "PER_PACKAGE_DIR must be defined"
+fi
+
+rsync_cmd=( rsync -a --hard-links --files-from=- --from0 )
+
+mkdir -p "$dest"
+
+if [ "${#subtrees[@]}" -eq 0 ]; then
+    exit 0
+fi
+
+case "$link" in
+hardlink)
+    search_subtrees | rsync_hardlink
+    ;;
+copy)
+    search_subtrees | "${rsync_cmd[@]}" "$PER_PACKAGE_DIR/" "$dest/"
+    ;;
+*)
+    usage "Unknown link command '$link'"
+    ;;
+esac
-- 
2.43.0

_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-12-04 12:56 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-27 22:41 [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync Brandon Maier via buildroot
2023-11-28 15:49 ` Herve Codina via buildroot
2023-11-28 17:21   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
2023-11-28 17:54     ` Herve Codina via buildroot
2023-11-28 18:04       ` Maier, Brandon L Collins via buildroot
2023-11-28 21:07 ` [Buildroot] " Yann E. MORIN
2023-11-29  0:00   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
2023-11-29  8:17   ` [Buildroot] " Herve Codina via buildroot
2023-12-01 17:05 ` [Buildroot] [PATCH v2 " Brandon Maier via buildroot
2023-12-04 12:46   ` Herve Codina via buildroot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox