#!/usr/bin/gawk -f
#
# Extract Launchpad attachment URLs (newest first, one per file name) from a
# saved bug page.
#
# Example usage to download the files, preserving timestamps and following
# redirects:
#   curl -o 752542.html https://bugs.launchpad.net/lpbugreporter/+bug/752542 &&
#   ./get-urls.awk 752542.html |
#   awk 'NR<=50{print "-O", $0}' | (cd /tmp/lp/dl/ && time xargs curl -qvRL)

BEGIN {
    # Split each line on double quotes so that $2 of an <a href="..."> line
    # is the URL itself.
    FS = "\""
    delete entries
}

# Only start collecting once we are inside the attachments portlet.
/portlet-attachments/ { p = 1 }

# The few .zip, .tar and .tar.bz2 attachments did not seem interesting.
# Plain .dsl{,.gz,.bz2}, .txt and .dat files are ignored as well, so only
# .tar.gz attachments are collected.
p && /\+files\/.*\.tar\.gz"/ {
    url = $2
    # URLs look like .../+bug/NNN/+attachment/<id>/+files/<name>, so after
    # splitting on "/" the attachment ID is field 8 and the file name field 10.
    split(url, parts, "/")
    name = parts[10]
    id = parts[8]
    # print url   # debug

    # Overwrite older names with newer submissions.
    entries[name] = url
    ids[id] = name
}

END {
    # Sort numerically by attachment ID (plain string order would misplace
    # IDs with a different number of digits).
    asorti(ids, ids_sorted, "@ind_num_asc")

    # Print attachments in reverse (newest first).
    for (i = length(ids_sorted); i > 0; i--) {
        id = ids_sorted[i]
        name = ids[id]
        if (name in entries) {
            print entries[name]
            # Do not print duplicates: the newest submission of a name wins.
            delete entries[name]
        }
    }
}
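
# A minimal smoke test (the project name, attachment IDs and file name below
# are made up; only the URL shape matches what the rules above expect):
#
#   printf '%s\n' \
#     '<div class="portlet-attachments">' \
#     '<a href="https://bugs.launchpad.net/x/+bug/752542/+attachment/100/+files/a.tar.gz">a</a>' \
#     '<a href="https://bugs.launchpad.net/x/+bug/752542/+attachment/250/+files/a.tar.gz">a</a>' \
#     | ./get-urls.awk
#
# Only the +attachment/250 URL should be printed: the newer submission of
# a.tar.gz overwrites the older one, and output is newest first.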