#!/usr/bin/gawk -f
#
# Extract Launchpad attachment URLs (newest first, one per file name) from a
# saved bug page.
#
# Example usage to download the files, preserving timestamps and following
# redirects:
#   curl -o 752542.html https://bugs.launchpad.net/lpbugreporter/+bug/752542 &&
#   ./get-urls.awk 752542.html |
#   awk 'NR<=50{print "-O", $0}' | (cd /tmp/lp/dl/ && time xargs curl -qvRL)

BEGIN {
    # Split each line on double quotes so that $2 of an <a href="..."> line
    # is the URL itself.
    FS = "\""
    delete entries
}

# Only start collecting once we are inside the attachments portlet.
/portlet-attachments/ { p = 1 }

# The few .zip, .tar and .tar.bz2 attachments did not seem interesting.
# Plain .dsl{,.gz,.bz2}, .txt and .dat files are ignored as well, so only
# .tar.gz attachments are collected.
p && /\+files\/.*\.tar\.gz"/ {
    url = $2
    # URLs look like .../+bug/NNN/+attachment/<id>/+files/<name>, so after
    # splitting on "/" the attachment ID is field 8 and the file name field 10.
    split(url, parts, "/")
    name = parts[10]
    id = parts[8]
    # print url   # debug

    # Overwrite older names with newer submissions.
    entries[name] = url
    ids[id] = name
}

END {
    # Sort numerically by attachment ID (plain string order would misplace
    # IDs with a different number of digits).
    asorti(ids, ids_sorted, "@ind_num_asc")

    # Print attachments in reverse (newest first).
    for (i = length(ids_sorted); i > 0; i--) {
        id = ids_sorted[i]
        name = ids[id]
        if (name in entries) {
            print entries[name]
            # Do not print duplicates: the newest submission of a name wins.
            delete entries[name]
        }
    }
}
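
# A minimal smoke test (the project name, attachment IDs and file name below
# are made up; only the URL shape matches what the rules above expect):
#
#   printf '%s\n' \
#     '<div class="portlet-attachments">' \
#     '<a href="https://bugs.launchpad.net/x/+bug/752542/+attachment/100/+files/a.tar.gz">a</a>' \
#     '<a href="https://bugs.launchpad.net/x/+bug/752542/+attachment/250/+files/a.tar.gz">a</a>' \
#     | ./get-urls.awk
#
# Only the +attachment/250 URL should be printed: the newer submission of
# a.tar.gz overwrites the older one, and output is newest first.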