File: //usr/lib/dracut/modules.d/99base/memtrace-ko.sh
#!/bin/sh
# Try to find out kernel modules with large total memory allocation during loading.
# For large slab allocation, it will fall into buddy, also not trace "mm_page_free"
# considering large free is quite rare for module_init, thus saving tons of events
# to avoid trace data overwritten.
#
# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose.
# "sys/kernel/tracing" has the priority if exists.
get_trace_base() {
    # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available.
    if [ -d "/sys/kernel/tracing" ]; then
        echo "/sys/kernel"
    else
        echo "/sys/kernel/debug"
    fi
}
# We want to enable these trace events.
get_want_events() {
    echo "module:module_put module:module_load kmem:mm_page_alloc"
}
get_event_filter() {
    echo "comm == systemd-udevd || comm == modprobe || comm == insmod"
}
is_trace_ready() {
    local trace_base want_events current_events
    trace_base=$(get_trace_base)
    ! [ -f "$trace_base/tracing/trace" ] && return 1
    [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1
    # Also check if trace events were properly setup.
    want_events=$(get_want_events)
    current_events=$(echo $(cat $trace_base/tracing/set_event))
    [ "$current_events" != "$want_events" ] && return 1
    return 0
}
prepare_trace() {
    local trace_base
    trace_base=$(get_trace_base)
    # old debugfs interface case.
    if ! [ -d "$trace_base/tracing" ]; then
        mount none -t debugfs $trace_base
    # new tracefs interface case.
    elif ! [ -f "$trace_base/tracing/trace" ]; then
        mount none -t tracefs "$trace_base/tracing"
    fi
    if ! [ -f "$trace_base/tracing/trace" ]; then
        echo "WARN: Mount trace failed for kernel module memory analyzing."
        return 1
    fi
    # Active all the wanted trace events.
    echo "$(get_want_events)" > $trace_base/tracing/set_event
    # There are three kinds of known applications for module loading:
    # "systemd-udevd", "modprobe" and "insmod".
    # Set them as the global events filter.
    # NOTE: Some kernel may not support this format of filter, anyway
    #       the operation will fail and it doesn't matter.
    echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1
    echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1
    # Set the number of comm-pid if supported.
    if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then
        # Thanks to filters, 4096 is big enough(also well supported).
        echo 4096 > $trace_base/tracing/saved_cmdlines_size
    fi
    # Enable and clear trace data for the first time.
    echo 1 > $trace_base/tracing/tracing_on
    echo > $trace_base/tracing/trace
    echo "Prepare trace success."
    return 0
}
order_to_pages()
{
    local pages=1
    local order=$1
    while [ "$order" != 0 ]; do
        order=$((order-1))
        pages=$(($pages*2))
	done
    echo $pages
}
parse_trace_data() {
    local module_name tmp_eval pages
    cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args
    do
        # Skip comment lines
        if [ "$pid" = "#" ]; then
            continue
        fi
        pid=${pid##*-}
        function=${function%:}
        if [ "$function" = "module_load" ]; then
            # One module is being loaded, save the task pid for tracking.
            # Remove the trailing after whitespace, there may be the module flags.
            module_name=${args%% *}
            # Mark current_module to track the task.
            eval current_module_$pid="$module_name"
            tmp_eval=$(eval echo '${module_loaded_'${module_name}'}')
            if [ -n "$tmp_eval" ]; then
                echo "WARN: \"$module_name\" was loaded multiple times!"
            fi
            eval unset module_loaded_$module_name
            eval nr_alloc_pages_$module_name=0
            continue
        fi
        module_name=$(eval echo '${current_module_'${pid}'}')
        if [ -z "$module_name" ]; then
            continue
        fi
        # Once we get here, the task is being tracked(is loading a module).
        if [ "$function" = "module_put" ]; then
            # Mark the module as loaded when the first module_put event happens after module_load.
            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
            echo "$tmp_eval pages consumed by \"$module_name\""
            eval module_loaded_$module_name=1
            # Module loading finished, so untrack the task.
            eval unset current_module_$pid
            eval unset nr_alloc_pages_$module_name
            continue
        fi
        if [ "$function" = "mm_page_alloc" ]; then
            # Get order first, then convert to actual pages.
            pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/')
            pages=$(order_to_pages "$pages")
            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
            eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))"
        fi
    done
}
cleanup_trace() {
    local trace_base
    if is_trace_ready; then
        trace_base=$(get_trace_base)
        echo 0 > $trace_base/tracing/tracing_on
        echo > $trace_base/tracing/trace
        echo > $trace_base/tracing/set_event
        echo 0 > $trace_base/tracing/events/kmem/filter
        echo 0 > $trace_base/tracing/events/module/filter
    fi
}
show_usage() {
    echo "Find out kernel modules with large memory consumption during loading based on trace."
    echo "Usage:"
    echo "1) run it first to setup trace."
    echo "2) run again to parse the trace data if any."
    echo "3) run with \"--cleanup\" option to cleanup trace after use."
}
if [ "$1" = "--help" ]; then
    show_usage
    exit 0
fi
if [ "$1" = "--cleanup" ]; then
    cleanup_trace
    exit 0
fi
if is_trace_ready ; then
    echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)"
    parse_trace_data
else
    prepare_trace
fi
exit $?