ferp Public

Watch 0 Fork 1 Star 0

Bash · 17232 bytes Raw Blame History

  
        1
        #!/usr/bin/env bash
      
        2
        #
      
        3
        # FERP vs grep Benchmark Suite
      
        4
        # Comprehensive performance comparison
      
        5
        #
      
        6
        # Requires: bash 4+, bc, python3 (for timing)
      
        7
        #
      
        8
        
        9
        set -e

        # ANSI colour escape sequences; they are expanded later by `echo -e` / printf.
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[1;33m'
        BLUE='\033[0;34m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        NC='\033[0m' # No Color

        # Configuration
        # Scratch directory for the generated corpora. mktemp creates it with an
        # unpredictable name, so another user cannot pre-create or guess the path.
        BENCH_DIR="$(mktemp -d "${TMPDIR:-/tmp}/ferp_benchmark_XXXXXX")" || {
            echo "ERROR: cannot create temporary benchmark directory" >&2
            exit 1
        }
        FERP="./ferp"
        GREP="grep"
        RUNS=3  # Number of runs per benchmark (take median)

        # Test file sizes (line counts; byte sizes are approximate)
        SMALL_LINES=10000      # ~700KB
        MEDIUM_LINES=100000    # ~7MB
        LARGE_LINES=1000000    # ~70MB

        # Results storage: three parallel arrays indexed by benchmark order.
        RESULT_NAMES=()
        RESULT_FERP_TIMES=()
        RESULT_GREP_TIMES=()
      
        35
        
        36
        #------------------------------------------------------------------------------
      
        37
        # Utility Functions
      
        38
        #------------------------------------------------------------------------------
      
        39
        
        40
        # Exit hook: announce the cleanup, then delete the benchmark scratch directory.
        # Globals: BENCH_DIR, CYAN, NC (read)
        # Outputs: one status line (preceded by a blank line) on stdout
        cleanup() {
            printf '\n%b%s%b\n' "${CYAN}" 'Cleaning up...' "${NC}"
            rm -rf "$BENCH_DIR"
        }

        # Run the hook on every exit path of the script.
        trap cleanup EXIT
      
        46
        
        47
        # Print a fatal error in red on stderr and terminate the script with status 1.
        # Arguments: $1 - error message (backslash escapes in it are expanded)
        # Globals:   RED, NC (read)
        die() {
            local message="$1"
            printf '%b\n' "${RED}ERROR: ${message}${NC}" >&2
            exit 1
        }
      
        51
        
        52
        # Verify that the tools the benchmark relies on are usable, building ferp on demand.
        # Globals: FERP, GREP (read); CYAN, YELLOW, GREEN, NC (read)
        # Outputs: progress messages on stdout
        # Exits:   through die (status 1) when ferp cannot be built or run, or when
        #          grep or awk cannot be found
        check_prerequisites() {
            echo -e "${CYAN}Checking prerequisites...${NC}"

            # Build ferp only when no executable is present at $FERP.
            if [[ ! -x "$FERP" ]]; then
                echo -e "${YELLOW}Building ferp (release mode)...${NC}"
                make release >/dev/null 2>&1 || die "Failed to build ferp"
            fi

            # Smoke test: ferp must succeed on a trivial match read from stdin.
            # "$FERP" is quoted so a path containing spaces is run as one word.
            echo "test" | "$FERP" "test" >/dev/null 2>&1 || die "ferp not working"

            # Check grep exists
            command -v "$GREP" >/dev/null 2>&1 || die "grep not found"

            # awk generates every test corpus, so fail early if it is missing.
            command -v awk >/dev/null 2>&1 || die "awk not found"

            echo -e "${GREEN}Prerequisites OK${NC}"
        }
      
        69
        
        70
        # Generate the benchmark corpora inside $BENCH_DIR.
        #
        # Files written:
        #   english_large.txt - LARGE_LINES lines cycling through ten sentence templates
        #   logs_medium.txt   - MEDIUM_LINES timestamped log records
        #   code_medium.txt   - MEDIUM_LINES lines of a repeating JavaScript-like function
        #   data_medium.csv   - a header row plus MEDIUM_LINES data rows
        #   english_small.txt - the first SMALL_LINES lines of english_large.txt
        #
        # Globals: BENCH_DIR, LARGE_LINES, MEDIUM_LINES, SMALL_LINES, CYAN, NC (read)
        # Outputs: progress messages and a per-file size listing on stdout
        create_test_files() {
            local path bytes size

            echo -e "\n${CYAN}Creating test files in $BENCH_DIR...${NC}"
            mkdir -p "$BENCH_DIR"

            # File 1: English-like text (varied content) - use awk for speed.
            # Templates containing %d are used as printf formats; the others are printed as-is.
            echo -e "  Creating english text file ($LARGE_LINES lines)..."
            awk -v n="$LARGE_LINES" 'BEGIN {
                lines[0] = "The quick brown fox jumps over the lazy dog near the riverbank."
                lines[1] = "Hello world, this is line number %d of the benchmark test file."
                lines[2] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit sed do."
                lines[3] = "Error: connection timeout after 30000ms on server node-%d."
                lines[4] = "DEBUG [2024-01-15 10:23:45] Processing request id=%d status=pending"
                lines[5] = "user@example.com logged in from 192.168.1.%d at 12:00:00"
                lines[6] = "WARNING: disk usage at %d%% on /dev/sda1 partition"
                lines[7] = "Function calculate_total(items=[1,2,3]) returned value=%d"
                lines[8] = "The API endpoint /api/v2/users/%d responded with HTTP 200 OK"
                lines[9] = "Configuration: max_threads=16, timeout=5000, retry_count=3"
                for (i = 1; i <= n; i++) {
                    idx = i % 10
                    if (idx == 0 || idx == 2 || idx == 9) {
                        print lines[idx]
                    } else if (idx == 3) {
                        printf lines[idx] "\n", i % 100
                    } else if (idx == 5) {
                        printf lines[idx] "\n", i % 256
                    } else if (idx == 6) {
                        printf lines[idx] "\n", 50 + (i % 50)
                    } else if (idx == 7) {
                        printf lines[idx] "\n", i * 42
                    } else {
                        printf lines[idx] "\n", i
                    }
                }
            }' > "$BENCH_DIR/english_large.txt"

            # File 2: Log-like file (structured)
            echo -e "  Creating log file ($MEDIUM_LINES lines)..."
            awk -v n="$MEDIUM_LINES" 'BEGIN {
                levels[0] = "INFO"; levels[1] = "DEBUG"; levels[2] = "WARN"; levels[3] = "ERROR"
                for (i = 1; i <= n; i++) {
                    day = 1 + (i % 28)
                    hour = i % 24
                    min = i % 60
                    sec = i % 60
                    comp = i % 20
                    printf "[2024-01-%02d %02d:%02d:%02d] %s: Message number %d from component-%d\n", \
                           day, hour, min, sec, levels[i % 4], i, comp
                }
            }' > "$BENCH_DIR/logs_medium.txt"

            # File 3: Code-like file (an eight-line function repeated with a changing name)
            echo -e "  Creating code file ($MEDIUM_LINES lines)..."
            awk -v n="$MEDIUM_LINES" 'BEGIN {
                for (i = 1; i <= n; i++) {
                    idx = i % 8
                    if (idx == 0) printf "function process_data_%d(input) {\n", i
                    else if (idx == 1) print "    const result = input.map(x => x * 2);"
                    else if (idx == 2) print "    if (result.length > 0) {"
                    else if (idx == 3) print "        console.log(\"Processing:\", result);"
                    else if (idx == 4) print "        return result.filter(x => x > 10);"
                    else if (idx == 5) print "    }"
                    else if (idx == 6) print "    return [];"
                    else print "}"
                }
            }' > "$BENCH_DIR/code_medium.txt"

            # File 4: CSV-like data.
            # The generator is seeded with a constant so every run benchmarks the same
            # scores; an unseeded srand() would produce a different file on each run.
            echo -e "  Creating CSV file ($MEDIUM_LINES lines)..."
            awk -v n="$MEDIUM_LINES" 'BEGIN {
                print "id,name,email,score,timestamp"
                srand(42)
                for (i = 1; i <= n; i++) {
                    score = int(rand() * 100)
                    printf "%d,user_%d,user%d@domain%d.com,%d,%d\n", \
                           i, i, i, i % 100, score, 1700000000 + i
                }
            }' > "$BENCH_DIR/data_medium.csv"

            # File 5: Small file for quick tests
            echo -e "  Creating small file ($SMALL_LINES lines)..."
            head -n "$SMALL_LINES" "$BENCH_DIR/english_large.txt" > "$BENCH_DIR/english_small.txt"

            # Print file sizes. Sizes come from wc rather than from parsing `ls` columns,
            # so paths containing whitespace are reported correctly.
            echo -e "\n${CYAN}Test files created:${NC}"
            for path in "$BENCH_DIR"/*.txt "$BENCH_DIR"/*.csv; do
                [[ -f "$path" ]] || continue   # glob matched nothing
                bytes=$(wc -c < "$path")
                size=$(awk -v b="$bytes" 'BEGIN {
                    b = b + 0
                    split("B K M G", unit, " ")
                    u = 1
                    while (b >= 1024 && u < 4) { b /= 1024; u++ }
                    fmt = (u == 1) ? "%d%s" : "%.1f%s"
                    printf fmt, b, unit[u]
                }')
                printf '  %s: %s\n' "$path" "$size"
            done
        }
      
        156
        
        157
        #------------------------------------------------------------------------------
      
        158
        # Benchmark Functions
      
        159
        #------------------------------------------------------------------------------
      
        160
        
        161
        # Run a command multiple times and return median time
      
        162
        # Time a shell command with bash's `time` keyword and report the median wall-clock time.
        # Arguments: $1 - command line; executed through eval with all of its output discarded
        # Globals:   RUNS (read) - number of repetitions
        # Outputs:   the median elapsed time in seconds on stdout (the lower middle value
        #            when RUNS is even); a run whose timing cannot be parsed counts as 999
        run_timed() {
            local cmd="$1"
            local times=()
            local run t
            # %R makes `time` print only the elapsed seconds, so no grep/sed/bc
            # post-processing is needed. Declared local so the caller's TIMEFORMAT
            # is left untouched.
            local TIMEFORMAT='%R'

            for ((run = 1; run <= RUNS; run++)); do
                # `time` reports on the shell's stderr, hence the redirect on the group;
                # the command's own output goes to /dev/null. `|| true` keeps a failing
                # command (e.g. grep finding no match) from aborting under `set -e`.
                t=$( { time eval "$cmd" >/dev/null 2>&1; } 2>&1 ) || true
                # NOTE(review): bash appears to print the locale's decimal separator here;
                # normalise a decimal comma so the value stays usable by sort/printf.
                t="${t/,/.}"
                if [[ ! "$t" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
                    t="999"  # Error case
                fi
                times+=("$t")
            done

            # Return median (sort and take middle)
            printf '%s\n' "${times[@]}" | sort -n | sed -n "$(( (RUNS + 1) / 2 ))p"
        }
      
        185
        
        186
        # Alternative timing using date (more portable)
      
        187
        # Convert a "seconds.fraction" timestamp into integer microseconds.
        # Accepts a decimal comma, any number of fraction digits, and drops non-digit
        # residue in the fraction (e.g. the literal "N" some `date` builds print for %N).
        _ferp_ts_to_us() {
            local ts="${1/,/.}"
            local sec="${ts%%.*}"
            local frac=""
            if [[ "$ts" == *.* ]]; then
                frac="${ts#*.}"
            fi
            frac="${frac//[!0-9]/}"
            frac="${frac}000000"   # right-pad, then keep exactly six digits
            frac="${frac:0:6}"
            # 10# forces base 10 so leading zeros are not read as octal.
            printf '%s\n' "$(( 10#${sec:-0} * 1000000 + 10#${frac} ))"
        }

        # Time a shell command by sampling the wall clock before and after it.
        # Arguments: $1 - command line; executed through eval with all of its output discarded
        # Globals:   RUNS (read) - number of repetitions
        # Outputs:   the median elapsed time on stdout as seconds with six decimals
        #            (the lower middle value when RUNS is even)
        #
        # The clock is read from bash's EPOCHREALTIME when that variable exists, which costs
        # no process spawn. Otherwise it falls back to python3, then `date`; both start a
        # process around the command, so that start-up cost is included in the measurement.
        run_timed_portable() {
            local cmd="$1"
            local times=()
            local run start end elapsed_us

            for ((run = 1; run <= RUNS; run++)); do
                # `|| true`: a non-zero exit of the measured command (e.g. grep with no
                # match) must not abort the script under `set -e`.
                if [[ -n "${EPOCHREALTIME:-}" ]]; then
                    start=$EPOCHREALTIME
                    eval "$cmd" >/dev/null 2>&1 || true
                    end=$EPOCHREALTIME
                else
                    start=$(python3 -c 'import time; print(time.time())' 2>/dev/null || date +%s.%N)
                    eval "$cmd" >/dev/null 2>&1 || true
                    end=$(python3 -c 'import time; print(time.time())' 2>/dev/null || date +%s.%N)
                fi

                # Conversion happens after both samples, outside the timed region.
                elapsed_us=$(( $(_ferp_ts_to_us "$end") - $(_ferp_ts_to_us "$start") ))
                if (( elapsed_us < 0 )); then
                    elapsed_us=0   # wall clock stepped backwards during the run
                fi
                times+=("$(printf '%d.%06d' "$(( elapsed_us / 1000000 ))" "$(( elapsed_us % 1000000 ))")")
            done

            # Return median
            printf '%s\n' "${times[@]}" | sort -n | sed -n "$(( (RUNS + 1) / 2 ))p"
        }
      
        202
        
        203
        # Time one pattern with ferp and with grep, print a result row and record it.
        # Arguments: $1 - display name
        #            $2 - file to search
        #            $3 - extra options for ferp (word-split on purpose; may be empty)
        #            $4 - extra options for grep (word-split on purpose; may be empty)
        #            $5 - search pattern
        # Globals:   FERP, GREP, GREEN, RED, NC (read);
        #            RESULT_NAMES, RESULT_FERP_TIMES, RESULT_GREP_TIMES (appended)
        # Outputs:   one row on stdout: name, both median times, speedup (grep time / ferp time)
        benchmark_pattern() {
            local name="$1"
            local file="$2"
            local ferp_args="$3"
            local grep_args="$4"
            local pattern="$5"
            local quoted_pattern quoted_file
            local ferp_time grep_time speedup verdict color

            printf "  %-35s" "$name"

            # The command line is re-parsed by eval inside the timer, so shell-quote the
            # operands with %q: a pattern or path containing quotes, spaces or other
            # metacharacters then reaches the tool unchanged.
            printf -v quoted_pattern '%q' "$pattern"
            printf -v quoted_file '%q' "$file"

            # Run ferp
            ferp_time=$(run_timed_portable "$FERP $ferp_args $quoted_pattern $quoted_file") || true

            # Run grep
            grep_time=$(run_timed_portable "$GREP $grep_args $quoted_pattern $quoted_file") || true

            # A timer that produced nothing is treated as 0 so the formatting below stays valid.
            ferp_time="${ferp_time:-0}"
            grep_time="${grep_time:-0}"

            # Calculate the speedup and its colour class in one awk call. A zero ferp time
            # gives "N/A" rather than a division by zero.
            read -r speedup verdict < <(awk -v f="$ferp_time" -v g="$grep_time" 'BEGIN {
                if (f + 0 <= 0) { print "N/A none"; exit }
                s = sprintf("%.2f", g / f)
                if (s + 0 > 1.5) v = "fast"
                else if (s + 0 < 0.8) v = "slow"
                else v = "none"
                print s, v
            }') || true
            speedup="${speedup:-N/A}"

            # Store results
            RESULT_NAMES+=("$name")
            RESULT_FERP_TIMES+=("$ferp_time")
            RESULT_GREP_TIMES+=("$grep_time")

            # Color-code the speedup: green above 1.5x, red below 0.8x.
            case "$verdict" in
                fast) color="$GREEN" ;;
                slow) color="$RED" ;;
                *) color="$NC" ;;
            esac

            if [[ "$speedup" == "N/A" ]]; then
                printf "ferp: %6.3fs  grep: %6.3fs  ${color}%5s${NC}\n" "$ferp_time" "$grep_time" "$speedup"
            else
                printf "ferp: %6.3fs  grep: %6.3fs  ${color}%5.2fx${NC}\n" "$ferp_time" "$grep_time" "$speedup"
            fi
        }
      
        236
        
        237
        #------------------------------------------------------------------------------
      
        238
        # Benchmark Suites
      
        239
        #------------------------------------------------------------------------------
      
        240
        
        241
        # Literal-string suite, run against the large English corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_literal_benchmarks() {
            local target="$BENCH_DIR/english_large.txt"
            local k
            # Triples of: display name, option given to both tools, pattern.
            local -a cases=(
                'Simple word (hello)'         ''   'hello'
                'Common word (the)'           ''   'the'
                'Longer phrase (quick brown)' ''   'quick brown'
                'Case insensitive (-i hello)' '-i' 'hello'
                'Fixed string (-F hello)'     '-F' 'hello'
                'Word boundary (-w the)'      '-w' 'the'
            )

            echo -e "\n${BOLD}${BLUE}=== Literal String Matching ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        252
        
        253
        # Regular-expression suite, run against the large English corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_regex_benchmarks() {
            local target="$BENCH_DIR/english_large.txt"
            local k
            # Triples of: display name, option given to both tools, pattern.
            local -a cases=(
                'Dot wildcard (h.llo)'       ''   'h.llo'
                'Star quantifier (hel*o)'    ''   'hel*o'
                'Character class ([a-z]+)'   '-E' '[a-z]+'
                'Mixed class ([a-zA-Z0-9]+)' '-E' '[a-zA-Z0-9]+'
                'Digit class ([0-9]+)'       '-E' '[0-9]+'
                'Alternation (cat|dog|fox)'  '-E' 'cat|dog|fox'
                'Optional (colou?r)'         '-E' 'colou?r'
                'One or more (hel+o)'        '-E' 'hel+o'
            )

            echo -e "\n${BOLD}${BLUE}=== Regular Expression Matching ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        266
        
        267
        # Anchor suite (line and word anchors), run against the large English corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_anchor_benchmarks() {
            local target="$BENCH_DIR/english_large.txt"
            local k
            # Triples of: display name, option given to both tools, pattern.
            # Single quotes keep the backslashes and dollar signs literal.
            local -a cases=(
                'Start anchor (^The)'        ''   '^The'
                'End anchor (\.$)'           ''   '\.$'
                'Both anchors (^The.*dog$)'  '-E' '^The.*dog$'
                'Word start (\<quick)'       ''   '\<quick'
                'Word end (fox\>)'           ''   'fox\>'
            )

            echo -e "\n${BOLD}${BLUE}=== Anchor Patterns ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        277
        
        278
        # Log-file suite, run against the medium log corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_log_benchmarks() {
            local target="$BENCH_DIR/logs_medium.txt"
            local k
            # Triples of: display name, option given to both tools, pattern.
            local -a cases=(
                'Log level (ERROR)'                       ''   'ERROR'
                'Log level (-i warn)'                     '-i' 'warn'
                'Timestamp pattern ([0-9]{2}:[0-9]{2})'   '-E' '[0-9]{2}:[0-9]{2}'
                'Component (component-[0-9]+)'            '-E' 'component-[0-9]+'
                'Multiple levels (ERROR|WARN)'            '-E' 'ERROR|WARN'
            )

            echo -e "\n${BOLD}${BLUE}=== Log File Patterns ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        288
        
        289
        # Source-code suite, run against the medium code corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_code_benchmarks() {
            local target="$BENCH_DIR/code_medium.txt"
            local k
            # Triples of: display name, option given to both tools, pattern.
            local -a cases=(
                'Function name (function)'    ''   'function'
                'Variable (const|let|var)'    '-E' 'const|let|var'
                'Return statement (return)'   ''   'return'
                'Console log (console\.log)'  '-E' 'console\.log'
            )

            echo -e "\n${BOLD}${BLUE}=== Code Pattern Matching ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        298
        
        299
        # CSV/data suite, run against the medium CSV corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_csv_benchmarks() {
            local target="$BENCH_DIR/data_medium.csv"
            local k
            # Triples of: display name, option given to both tools, pattern.
            local -a cases=(
                'Email pattern (@.*\.com)'     '-E' '@.*\.com'
                'Specific domain (domain50)'   ''   'domain50'
                'User pattern (user_[0-9]+)'   '-E' 'user_[0-9]+'
                'High score (,[89][0-9],)'     '-E' ',[89][0-9],'
            )

            echo -e "\n${BOLD}${BLUE}=== CSV/Data Pattern Matching ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        308
        
        309
        # Special-option suite (invert, count, line numbers, long alternations),
        # run against the large English corpus.
        # Globals: BENCH_DIR, BOLD, BLUE, NC (read)
        run_special_benchmarks() {
            local target="$BENCH_DIR/english_large.txt"
            local k
            # Triples of: display name, option given to both tools, pattern.
            local -a cases=(
                'Invert match (-v error)'                    '-v' 'error'
                'Count only (-c the)'                        '-c' 'the'
                'Line number (-n hello)'                     '-n' 'hello'
                'Multiple patterns (cat|dog|bird|fish)'      '-E' 'cat|dog|bird|fish'
                'Long alternation (the|and|for|with|from)'   '-E' 'the|and|for|with|from'
            )

            echo -e "\n${BOLD}${BLUE}=== Special Cases ===${NC}"
            for ((k = 0; k < ${#cases[@]}; k += 3)); do
                benchmark_pattern "${cases[k]}" "$target" "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
            done
        }
      
        319
        
        320
        # Print num/den with one decimal, or "N/A" when the denominator is not positive.
        _scaling_ratio() {
            awk -v num="$1" -v den="$2" 'BEGIN {
                if (den + 0 > 0) printf "%.1f\n", num / den
                else print "N/A"
            }'
        }

        # Scaling suite: run the same pattern on the small and the large English corpus
        # and report how much longer each tool takes on the large one.
        # Globals: BENCH_DIR, BOLD, BLUE, CYAN, NC (read);
        #          RESULT_FERP_TIMES, RESULT_GREP_TIMES (read after benchmark_pattern appends)
        # Outputs: two benchmark rows and one ratio line on stdout
        run_scaling_benchmarks() {
            local num_results
            local small_ferp large_ferp small_grep large_grep
            local ferp_scale grep_scale

            echo -e "\n${BOLD}${BLUE}=== Scaling Tests ===${NC}"

            echo -e "  ${CYAN}Small file (~700KB):${NC}"
            benchmark_pattern "  [a-z]+ on small" "$BENCH_DIR/english_small.txt" "-E" "-E" "[a-z]+"

            echo -e "  ${CYAN}Large file (~70MB):${NC}"
            benchmark_pattern "  [a-z]+ on large" "$BENCH_DIR/english_large.txt" "-E" "-E" "[a-z]+"

            # The two calls above appended the last two entries: small first, then large.
            num_results=${#RESULT_FERP_TIMES[@]}
            small_ferp="${RESULT_FERP_TIMES[$((num_results - 2))]}"
            large_ferp="${RESULT_FERP_TIMES[$((num_results - 1))]}"
            small_grep="${RESULT_GREP_TIMES[$((num_results - 2))]}"
            large_grep="${RESULT_GREP_TIMES[$((num_results - 1))]}"

            echo -e "\n  ${CYAN}Scaling (large/small ratio):${NC}"
            # A zero or missing small-file time yields "N/A" instead of an empty ratio.
            ferp_scale=$(_scaling_ratio "$large_ferp" "$small_ferp")
            grep_scale=$(_scaling_ratio "$large_grep" "$small_grep")
            echo -e "    ferp: ${ferp_scale}x  grep: ${grep_scale}x  (lower is better for large files)"
        }
      
        341
        
        342
        #------------------------------------------------------------------------------
      
        343
        # Report Generation
      
        344
        #------------------------------------------------------------------------------
      
        345
        
        346
        # Print the closing report: totals, per-tool win counts, overall speedup and
        # information about the machine and the tools.
        # Globals: RESULT_NAMES, RESULT_FERP_TIMES, RESULT_GREP_TIMES, FERP, GREP, RUNS,
        #          BOLD, BLUE, CYAN, GREEN, RED, NC (read)
        # Outputs: the report on stdout
        #
        # A benchmark counts as a ferp win only when ferp was strictly faster; ties count
        # for grep. "Average speedup" is total grep time divided by total ferp time, shown
        # as N/A when the ferp total is zero (for example when nothing was recorded).
        print_summary() {
            local count=${#RESULT_NAMES[@]}
            local idx
            local total_ferp total_grep wins_ferp wins_grep avg_speedup
            local cpu ferp_version grep_version

            echo -e "\n${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"
            echo -e "${BOLD}${BLUE}                        SUMMARY                                 ${NC}"
            echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"

            # One awk pass over tab-separated "ferp<TAB>grep" rows does all the arithmetic.
            read -r total_ferp total_grep wins_ferp wins_grep avg_speedup < <(
                for idx in "${!RESULT_NAMES[@]}"; do
                    printf '%s\t%s\n' "${RESULT_FERP_TIMES[$idx]}" "${RESULT_GREP_TIMES[$idx]}"
                done | awk -F'\t' '
                    {
                        ft = $1 + 0; gt = $2 + 0
                        tf += ft; tg += gt
                        if (ft < gt) wf++; else wg++
                    }
                    END {
                        ratio = (tf > 0) ? sprintf("%.2f", tg / tf) : "N/A"
                        printf "%.6f %.6f %d %d %s\n", tf, tg, wf, wg, ratio
                    }'
            ) || true

            echo -e "\n${CYAN}Overall Statistics:${NC}"
            echo -e "  Total benchmarks run: $count"
            echo -e "  ferp wins: ${GREEN}${wins_ferp:-0}${NC}"
            echo -e "  grep wins: ${RED}${wins_grep:-0}${NC}"
            printf "  Total time - ferp: %.3fs  grep: %.3fs\n" "${total_ferp:-0}" "${total_grep:-0}"
            echo -e "  ${BOLD}Average speedup: ${GREEN}${avg_speedup:-N/A}x${NC}"

            # Each probe falls back to "Unknown" when it produces no text. An `|| echo`
            # after a pipeline never fires here, because the pipeline's status is that of
            # its last stage (head / xargs), not of the probe itself.
            cpu=$(sysctl -n machdep.cpu.brand_string 2>/dev/null) \
                || cpu=$(lscpu 2>/dev/null | grep 'Model name' | cut -d: -f2 | xargs) \
                || cpu=""
            ferp_version=$("$FERP" --version 2>&1 | head -1) || true
            grep_version=$("$GREP" --version 2>&1 | head -1) || true

            echo -e "\n${CYAN}System Information:${NC}"
            echo -e "  OS: $(uname -s) $(uname -r)"
            echo -e "  CPU: ${cpu:-Unknown}"
            echo -e "  ferp version: ${ferp_version:-Unknown}"
            echo -e "  grep version: ${grep_version:-Unknown}"
            echo -e "  Runs per benchmark: $RUNS (median taken)"

            echo -e "\n${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"
        }
      
        388
        
        389
        #------------------------------------------------------------------------------
      
        390
        # Main
      
        391
        #------------------------------------------------------------------------------
      
        392
        
        393
        # Entry point: print the banner, prepare the environment, run every benchmark
        # suite in a fixed order and finish with the summary report.
        # Globals: BOLD, BLUE, CYAN, GREEN, NC, RUNS (read)
        main() {
            local suite

            # Banner (colour is switched on before the box and reset after it).
            echo -e "${BOLD}${BLUE}"
            printf '%s\n' \
                "╔══════════════════════════════════════════════════════════════╗" \
                "║           FERP vs grep Benchmark Suite                       ║" \
                "║           Comprehensive Performance Comparison               ║" \
                "╚══════════════════════════════════════════════════════════════╝"
            echo -e "${NC}"

            check_prerequisites
            create_test_files

            echo -e "\n${BOLD}${CYAN}Running benchmarks (${RUNS} runs each, reporting median)...${NC}"
            echo -e "${CYAN}Format: ferp time | grep time | speedup (>1 = ferp faster)${NC}\n"

            # Each suite is a function named run_<suite>_benchmarks.
            for suite in literal regex anchor log code csv special scaling; do
                "run_${suite}_benchmarks"
            done

            print_summary

            echo -e "\n${GREEN}Benchmark complete!${NC}"
        }
      
        420
        
        421
        # Parse the command line into the global configuration, then start the benchmark.
        #
        # Options (any order, may be combined):
        #   -h, --help      print usage and exit 0
        #   -q, --quick     smaller corpora and a single run
        #   -r, --runs N    runs per benchmark; N must be a positive integer. An explicit
        #                   N wins over --quick's single run regardless of option order.
        #                   With no value at all the default of 3 is used.
        # Globals: RUNS, SMALL_LINES, MEDIUM_LINES, LARGE_LINES (written); YELLOW, NC (read)
        # Exits:   through die (status 1) on an invalid run count or an unknown option
        parse_args() {
            local runs_override=""

            while (( $# > 0 )); do
                case "$1" in
                    -h|--help)
                        echo "Usage: $0 [OPTIONS]"
                        echo ""
                        echo "Options:"
                        echo "  -r, --runs N    Number of runs per benchmark (default: 3)"
                        echo "  -q, --quick     Quick mode (smaller files, fewer runs)"
                        echo "  -h, --help      Show this help"
                        exit 0
                        ;;
                    -q|--quick)
                        RUNS=1
                        SMALL_LINES=1000
                        MEDIUM_LINES=10000
                        LARGE_LINES=100000
                        echo -e "${YELLOW}Quick mode: reduced file sizes and single run${NC}"
                        ;;
                    -r|--runs)
                        if (( $# < 2 )); then
                            runs_override=3   # no value supplied: documented default
                        elif [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
                            runs_override="$2"
                            shift
                        else
                            die "$1 expects a positive integer, got '$2'"
                        fi
                        ;;
                    *)
                        die "Unknown option '$1' (see --help)"
                        ;;
                esac
                shift
            done

            # Applied last so that "-r N -q" and "-q -r N" behave the same.
            if [[ -n "$runs_override" ]]; then
                RUNS="$runs_override"
            fi
        }

        parse_args "$@"
        main

1	#!/usr/bin/env bash
2	#
3	# FERP vs grep Benchmark Suite
4	# Comprehensive performance comparison
5	#
6	# Requires: bash 4+, bc, python3 (for timing)
7	#
8
set -e

# ANSI colour escape sequences; they are expanded later by `echo -e` / printf.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# Configuration
# Scratch directory for the generated corpora. mktemp creates it with an
# unpredictable name, so another user cannot pre-create or guess the path.
BENCH_DIR="$(mktemp -d "${TMPDIR:-/tmp}/ferp_benchmark_XXXXXX")" || {
    echo "ERROR: cannot create temporary benchmark directory" >&2
    exit 1
}
FERP="./ferp"
GREP="grep"
RUNS=3 # Number of runs per benchmark (take median)

# Test file sizes (line counts; byte sizes are approximate)
SMALL_LINES=10000 # ~700KB
MEDIUM_LINES=100000 # ~7MB
LARGE_LINES=1000000 # ~70MB

# Results storage: three parallel arrays indexed by benchmark order.
RESULT_NAMES=()
RESULT_FERP_TIMES=()
RESULT_GREP_TIMES=()
35
36	#------------------------------------------------------------------------------
37	# Utility Functions
38	#------------------------------------------------------------------------------
39
# Exit hook: announce the cleanup, then delete the benchmark scratch directory.
# Globals: BENCH_DIR, CYAN, NC (read)
# Outputs: one status line (preceded by a blank line) on stdout
cleanup() {
    printf '\n%b%s%b\n' "${CYAN}" 'Cleaning up...' "${NC}"
    rm -rf "$BENCH_DIR"
}

# Run the hook on every exit path of the script.
trap cleanup EXIT
46
# Print a fatal error in red on stderr and terminate the script with status 1.
# Arguments: $1 - error message (backslash escapes in it are expanded)
# Globals:   RED, NC (read)
die() {
    local message="$1"
    printf '%b\n' "${RED}ERROR: ${message}${NC}" >&2
    exit 1
}
51
# Verify that the tools the benchmark relies on are usable, building ferp on demand.
# Globals: FERP, GREP (read); CYAN, YELLOW, GREEN, NC (read)
# Outputs: progress messages on stdout
# Exits:   through die (status 1) when ferp cannot be built or run, or when
#          grep or awk cannot be found
check_prerequisites() {
    echo -e "${CYAN}Checking prerequisites...${NC}"

    # Build ferp only when no executable is present at $FERP.
    if [[ ! -x "$FERP" ]]; then
        echo -e "${YELLOW}Building ferp (release mode)...${NC}"
        make release >/dev/null 2>&1 || die "Failed to build ferp"
    fi

    # Smoke test: ferp must succeed on a trivial match read from stdin.
    # "$FERP" is quoted so a path containing spaces is run as one word.
    echo "test" | "$FERP" "test" >/dev/null 2>&1 || die "ferp not working"

    # Check grep exists
    command -v "$GREP" >/dev/null 2>&1 || die "grep not found"

    # awk generates every test corpus, so fail early if it is missing.
    command -v awk >/dev/null 2>&1 || die "awk not found"

    echo -e "${GREEN}Prerequisites OK${NC}"
}
69
# Generate the benchmark corpora inside $BENCH_DIR.
#
# Files written:
#   english_large.txt - LARGE_LINES lines cycling through ten sentence templates
#   logs_medium.txt   - MEDIUM_LINES timestamped log records
#   code_medium.txt   - MEDIUM_LINES lines of a repeating JavaScript-like function
#   data_medium.csv   - a header row plus MEDIUM_LINES data rows
#   english_small.txt - the first SMALL_LINES lines of english_large.txt
#
# Globals: BENCH_DIR, LARGE_LINES, MEDIUM_LINES, SMALL_LINES, CYAN, NC (read)
# Outputs: progress messages and a per-file size listing on stdout
create_test_files() {
    local path bytes size

    echo -e "\n${CYAN}Creating test files in $BENCH_DIR...${NC}"
    mkdir -p "$BENCH_DIR"

    # File 1: English-like text (varied content) - use awk for speed.
    # Templates containing %d are used as printf formats; the others are printed as-is.
    echo -e "  Creating english text file ($LARGE_LINES lines)..."
    awk -v n="$LARGE_LINES" 'BEGIN {
        lines[0] = "The quick brown fox jumps over the lazy dog near the riverbank."
        lines[1] = "Hello world, this is line number %d of the benchmark test file."
        lines[2] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit sed do."
        lines[3] = "Error: connection timeout after 30000ms on server node-%d."
        lines[4] = "DEBUG [2024-01-15 10:23:45] Processing request id=%d status=pending"
        lines[5] = "user@example.com logged in from 192.168.1.%d at 12:00:00"
        lines[6] = "WARNING: disk usage at %d%% on /dev/sda1 partition"
        lines[7] = "Function calculate_total(items=[1,2,3]) returned value=%d"
        lines[8] = "The API endpoint /api/v2/users/%d responded with HTTP 200 OK"
        lines[9] = "Configuration: max_threads=16, timeout=5000, retry_count=3"
        for (i = 1; i <= n; i++) {
            idx = i % 10
            if (idx == 0 || idx == 2 || idx == 9) {
                print lines[idx]
            } else if (idx == 3) {
                printf lines[idx] "\n", i % 100
            } else if (idx == 5) {
                printf lines[idx] "\n", i % 256
            } else if (idx == 6) {
                printf lines[idx] "\n", 50 + (i % 50)
            } else if (idx == 7) {
                printf lines[idx] "\n", i * 42
            } else {
                printf lines[idx] "\n", i
            }
        }
    }' > "$BENCH_DIR/english_large.txt"

    # File 2: Log-like file (structured)
    echo -e "  Creating log file ($MEDIUM_LINES lines)..."
    awk -v n="$MEDIUM_LINES" 'BEGIN {
        levels[0] = "INFO"; levels[1] = "DEBUG"; levels[2] = "WARN"; levels[3] = "ERROR"
        for (i = 1; i <= n; i++) {
            day = 1 + (i % 28)
            hour = i % 24
            min = i % 60
            sec = i % 60
            comp = i % 20
            printf "[2024-01-%02d %02d:%02d:%02d] %s: Message number %d from component-%d\n", \
                   day, hour, min, sec, levels[i % 4], i, comp
        }
    }' > "$BENCH_DIR/logs_medium.txt"

    # File 3: Code-like file (an eight-line function repeated with a changing name)
    echo -e "  Creating code file ($MEDIUM_LINES lines)..."
    awk -v n="$MEDIUM_LINES" 'BEGIN {
        for (i = 1; i <= n; i++) {
            idx = i % 8
            if (idx == 0) printf "function process_data_%d(input) {\n", i
            else if (idx == 1) print "    const result = input.map(x => x * 2);"
            else if (idx == 2) print "    if (result.length > 0) {"
            else if (idx == 3) print "        console.log(\"Processing:\", result);"
            else if (idx == 4) print "        return result.filter(x => x > 10);"
            else if (idx == 5) print "    }"
            else if (idx == 6) print "    return [];"
            else print "}"
        }
    }' > "$BENCH_DIR/code_medium.txt"

    # File 4: CSV-like data.
    # The generator is seeded with a constant so every run benchmarks the same
    # scores; an unseeded srand() would produce a different file on each run.
    echo -e "  Creating CSV file ($MEDIUM_LINES lines)..."
    awk -v n="$MEDIUM_LINES" 'BEGIN {
        print "id,name,email,score,timestamp"
        srand(42)
        for (i = 1; i <= n; i++) {
            score = int(rand() * 100)
            printf "%d,user_%d,user%d@domain%d.com,%d,%d\n", \
                   i, i, i, i % 100, score, 1700000000 + i
        }
    }' > "$BENCH_DIR/data_medium.csv"

    # File 5: Small file for quick tests
    echo -e "  Creating small file ($SMALL_LINES lines)..."
    head -n "$SMALL_LINES" "$BENCH_DIR/english_large.txt" > "$BENCH_DIR/english_small.txt"

    # Print file sizes. Sizes come from wc rather than from parsing `ls` columns,
    # so paths containing whitespace are reported correctly.
    echo -e "\n${CYAN}Test files created:${NC}"
    for path in "$BENCH_DIR"/*.txt "$BENCH_DIR"/*.csv; do
        [[ -f "$path" ]] || continue   # glob matched nothing
        bytes=$(wc -c < "$path")
        size=$(awk -v b="$bytes" 'BEGIN {
            b = b + 0
            split("B K M G", unit, " ")
            u = 1
            while (b >= 1024 && u < 4) { b /= 1024; u++ }
            fmt = (u == 1) ? "%d%s" : "%.1f%s"
            printf fmt, b, unit[u]
        }')
        printf '  %s: %s\n' "$path" "$size"
    done
}
156
157	#------------------------------------------------------------------------------
158	# Benchmark Functions
159	#------------------------------------------------------------------------------
160
# Run a command RUNS times and print the median wall-clock time in seconds.
#
# Timing comes from bash's `time` keyword (not /usr/bin/time).
#
# Globals:   RUNS (read) - number of timed repetitions
# Arguments: $1 - command line to execute (handed to eval)
# Outputs:   median elapsed seconds on stdout; a run whose timing report
#            cannot be parsed is recorded as 999
run_timed() {
  local cmd="$1"
  local -a times=()
  local run t
  # Pin the `time` report to bare elapsed seconds so parsing does not depend
  # on whatever TIMEFORMAT the caller's environment happens to carry.
  local TIMEFORMAT='%3R'

  for ((run = 0; run < RUNS; run++)); do
    # The command's own output is discarded; only the timing report, which
    # bash writes to the group's stderr, is captured. The command's exit
    # status is irrelevant to the measurement.
    t=$( { time eval "$cmd" >/dev/null 2>&1; } 2>&1 ) || true
    t=${t/,/.} # some locales print a decimal comma
    if [[ ! "$t" =~ ^[0-9]+\.?[0-9]*$ ]]; then
      t="999" # Error case
    fi
    times+=("$t")
  done

  # Median: sort numerically and take the middle entry (the lower middle
  # when RUNS is even).
  printf '%s\n' "${times[@]}" | sort -n | sed -n "$(( (RUNS + 1) / 2 ))p"
}
185
# Store the current time, as whole microseconds since the epoch, in the
# variable named by $1.
#
# Clock sources, in order of preference: bash 5's EPOCHREALTIME (no process
# is spawned), python3, GNU date with %N, and finally date with one-second
# resolution for platforms whose date lacks %N.
_now_usec_into() {
  local _stamp
  if [[ -n "${EPOCHREALTIME:-}" ]]; then
    # "<seconds><decimal point><6-digit microseconds>": dropping the
    # locale-dependent separator leaves an integer microsecond count.
    _stamp=${EPOCHREALTIME//[!0-9]/}
  elif _stamp=$(python3 -c 'import time; print(int(time.time() * 1000000))' 2>/dev/null); then
    :
  else
    _stamp=$(date +%s%N)
    if [[ "$_stamp" =~ ^[0-9]+$ ]]; then
      _stamp=$((_stamp / 1000)) # nanoseconds -> microseconds
    else
      _stamp=$(( $(date +%s) * 1000000 )) # %N unsupported: whole seconds
    fi
  fi
  printf -v "$1" '%s' "$_stamp"
}

# Run a command RUNS times, timing each run with wall-clock timestamps taken
# immediately before and after it, and print the median in seconds.
#
# Globals:   RUNS (read) - number of timed repetitions
# Arguments: $1 - command line to execute (handed to eval)
# Outputs:   median elapsed time on stdout as seconds with six decimals
run_timed_portable() {
  local cmd="$1"
  local -a times=()
  local run start_us end_us elapsed_us t

  for ((run = 0; run < RUNS; run++)); do
    _now_usec_into start_us
    # A non-zero status (e.g. grep finding no match) is a normal benchmark
    # outcome and must not abort the script under `set -e`.
    eval "$cmd" >/dev/null 2>&1 || true
    _now_usec_into end_us

    elapsed_us=$((end_us - start_us))
    if ((elapsed_us < 0)); then
      elapsed_us=0 # clock stepped backwards during the run
    fi
    printf -v t '%d.%06d' "$((elapsed_us / 1000000))" "$((elapsed_us % 1000000))"
    times+=("$t")
  done

  # Median: sort numerically and take the middle entry (the lower middle
  # when RUNS is even).
  printf '%s\n' "${times[@]}" | sort -n | sed -n "$(( (RUNS + 1) / 2 ))p"
}
202
# Time ferp and grep on one pattern/file pair, record both medians and print
# one result row.
#
# Globals:   FERP, GREP (read) - commands under test
#            GREEN, RED, NC (read) - colour escape sequences
#            RESULT_NAMES, RESULT_FERP_TIMES, RESULT_GREP_TIMES (appended)
# Arguments: $1 - label shown in the first column
#            $2 - file to search
#            $3 - options for ferp (word-split, may be empty)
#            $4 - options for grep (word-split, may be empty)
#            $5 - search pattern
# Outputs:   "<label> ferp: <s> grep: <s> <speedup>x" on stdout; the speedup
#            is grep time / ferp time, or N/A when ferp's time is not positive
benchmark_pattern() {
  local name="$1"
  local file="$2"
  local ferp_args="$3"
  local grep_args="$4"
  local pattern="$5"
  local quoted_pattern quoted_file ferp_time grep_time speedup color hundredths

  printf " %-35s" "$name"

  # The command line is re-parsed by eval inside the timer, so pattern and
  # file are shell-quoted with %q; wrapping them in bare single quotes would
  # break on any value containing a quote. The option strings stay unquoted
  # on purpose so that several flags split into separate words.
  printf -v quoted_pattern '%q' "$pattern"
  printf -v quoted_file '%q' "$file"

  # Run ferp
  ferp_time=$(run_timed_portable "$FERP $ferp_args $quoted_pattern $quoted_file")

  # Run grep
  grep_time=$(run_timed_portable "$GREP $grep_args $quoted_pattern $quoted_file")

  # Calculate speedup, guarding the division explicitly
  speedup=$(awk -v g="$grep_time" -v f="$ferp_time" \
    'BEGIN { if (f + 0 > 0) printf "%.2f", g / f; else printf "N/A" }')

  # Store results
  RESULT_NAMES+=("$name")
  RESULT_FERP_TIMES+=("$ferp_time")
  RESULT_GREP_TIMES+=("$grep_time")

  # Color-code the speedup: >1.5 green, <0.8 red. The value has exactly two
  # decimals, so dropping the point gives integer hundredths.
  color="$NC"
  if [[ "$speedup" != "N/A" ]]; then
    hundredths=$((10#${speedup/./}))
    if ((hundredths > 150)); then
      color="$GREEN"
    elif ((hundredths < 80)); then
      color="$RED"
    fi
  fi

  # %b expands the \033 escapes stored in the colour variables; %5s keeps the
  # column width of the numeric form while also accepting "N/A".
  printf 'ferp: %6.3fs grep: %6.3fs %b%5sx%b\n' \
    "$ferp_time" "$grep_time" "$color" "$speedup" "$NC"
}
236
237	#------------------------------------------------------------------------------
238	# Benchmark Suites
239	#------------------------------------------------------------------------------
240
# Literal-string suite: plain words and phrases, plus the -i, -F and -w
# variants, all searched in the large English text file.
run_literal_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Literal String Matching ===${NC}"
  local target="$BENCH_DIR/english_large.txt"

  # Flat list of (label, options, pattern) triples; ferp and grep always
  # receive the same options in this suite.
  local -a cases=(
    "Simple word (hello)"          ""   "hello"
    "Common word (the)"            ""   "the"
    "Longer phrase (quick brown)"  ""   "quick brown"
    "Case insensitive (-i hello)"  "-i" "hello"
    "Fixed string (-F hello)"      "-F" "hello"
    "Word boundary (-w the)"       "-w" "the"
  )

  local k
  for ((k = 0; k < ${#cases[@]}; k += 3)); do
    benchmark_pattern "${cases[k]}" "$target" \
      "${cases[k + 1]}" "${cases[k + 1]}" "${cases[k + 2]}"
  done
}
252
# Regular-expression suite: wildcard, quantifiers, character classes and
# alternation, all searched in the large English text file.
#
# Globals: BENCH_DIR, BOLD, BLUE, NC (read)
run_regex_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Regular Expression Matching ===${NC}"
  local file="$BENCH_DIR/english_large.txt"

  benchmark_pattern "Dot wildcard (h.llo)" "$file" "" "" "h.llo"
  # The star-quantifier case must actually contain a star; "helo" would
  # only benchmark a literal.
  benchmark_pattern "Star quantifier (hel*o)" "$file" "" "" "hel*o"
  benchmark_pattern "Character class ([a-z]+)" "$file" "-E" "-E" "[a-z]+"
  benchmark_pattern "Mixed class ([a-zA-Z0-9]+)" "$file" "-E" "-E" "[a-zA-Z0-9]+"
  benchmark_pattern "Digit class ([0-9]+)" "$file" "-E" "-E" "[0-9]+"
  benchmark_pattern "Alternation (cat|dog|fox)" "$file" "-E" "-E" "cat|dog|fox"
  benchmark_pattern "Optional (colou?r)" "$file" "-E" "-E" "colou?r"
  benchmark_pattern "One or more (hel+o)" "$file" "-E" "-E" "hel+o"
}
266
# Anchor suite: line-start, line-end and word-boundary anchors, all searched
# in the large English text file.
#
# Globals: BENCH_DIR, BOLD, BLUE, NC (read)
run_anchor_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Anchor Patterns ===${NC}"
  local file="$BENCH_DIR/english_large.txt"

  benchmark_pattern "Start anchor (^The)" "$file" "" "" "^The"
  benchmark_pattern "End anchor (\\.$)" "$file" "" "" '\.$'
  # ".*" between the anchors lets the pattern span a whole line; a single
  # "." would require the entire line to be exactly seven characters long.
  benchmark_pattern "Both anchors (^The.*dog$)" "$file" "-E" "-E" '^The.*dog$'
  benchmark_pattern "Word start (\\<quick)" "$file" "" "" '\<quick'
  benchmark_pattern "Word end (fox\\>)" "$file" "" "" 'fox\>'
}
277
# Log suite: level names, timestamps and component ids searched in the
# generated log file.
run_log_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Log File Patterns ===${NC}"
  local log_file="$BENCH_DIR/logs_medium.txt"

  # Load the cases as (label, options, pattern) triples into the function's
  # own positional parameters and consume them three at a time; ferp and
  # grep always receive the same options in this suite.
  set -- \
    "Log level (ERROR)" "" "ERROR" \
    "Log level (-i warn)" "-i" "warn" \
    "Timestamp pattern ([0-9]{2}:[0-9]{2})" "-E" "[0-9]{2}:[0-9]{2}" \
    "Component (component-[0-9]+)" "-E" "component-[0-9]+" \
    "Multiple levels (ERROR|WARN)" "-E" "ERROR|WARN"

  while (($# >= 3)); do
    benchmark_pattern "$1" "$log_file" "$2" "$2" "$3"
    shift 3
  done
}
288
# Code suite: keywords and call sites searched in the generated
# JavaScript-like source file.
run_code_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Code Pattern Matching ===${NC}"
  local source_file="$BENCH_DIR/code_medium.txt"

  # Parallel lists: entry n of each array describes benchmark n. Both tools
  # get the same options.
  local -a labels=(
    "Function name (function)"
    "Variable (const|let|var)"
    "Return statement (return)"
    "Console log (console\\.log)"
  )
  local -a flags=("" "-E" "" "-E")
  local -a patterns=("function" "const|let|var" "return" "console\\.log")

  local n
  for n in "${!labels[@]}"; do
    benchmark_pattern "${labels[n]}" "$source_file" \
      "${flags[n]}" "${flags[n]}" "${patterns[n]}"
  done
}
298
# CSV suite: e-mail, domain, user-name and score patterns searched in the
# generated CSV file.
#
# Globals: BENCH_DIR, BOLD, BLUE, NC (read)
run_csv_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== CSV/Data Pattern Matching ===${NC}"
  local file="$BENCH_DIR/data_medium.csv"

  # ".*" is required between "@" and ".com": a single "." can never match
  # multi-character hosts such as "domain42".
  benchmark_pattern "Email pattern (@.*\\.com)" "$file" "-E" "-E" "@.*\\.com"
  benchmark_pattern "Specific domain (domain50)" "$file" "" "" "domain50"
  benchmark_pattern "User pattern (user_[0-9]+)" "$file" "-E" "-E" "user_[0-9]+"
  benchmark_pattern "High score (,[89][0-9],)" "$file" "-E" "-E" ",[89][0-9],"
}
308
# Special-case suite: inverted match, count-only, line numbers and wide
# alternations, all searched in the large English text file.
run_special_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Special Cases ===${NC}"
  local haystack="$BENCH_DIR/english_large.txt"

  # (label, options, pattern) triples; both tools get the same options.
  local -a specs=(
    "Invert match (-v error)" "-v" "error"
    "Count only (-c the)" "-c" "the"
    "Line number (-n hello)" "-n" "hello"
    "Multiple patterns (cat|dog|bird|fish)" "-E" "cat|dog|bird|fish"
    "Long alternation (the|and|for|with|from)" "-E" "the|and|for|with|from"
  )

  local pos=0
  while ((pos < ${#specs[@]})); do
    benchmark_pattern "${specs[pos]}" "$haystack" \
      "${specs[pos + 1]}" "${specs[pos + 1]}" "${specs[pos + 2]}"
    pos=$((pos + 3))
  done
}
319
# Print numerator/denominator with one decimal, or N/A when the denominator
# is missing or not positive.
_scaling_ratio() {
  awk -v num="$1" -v den="$2" \
    'BEGIN { if (den + 0 > 0) printf "%.1f", num / den; else printf "N/A" }'
}

# Scaling suite: run the same pattern on the small and the large English
# file, then report how much slower each tool is on the large one.
#
# Globals: BENCH_DIR, BOLD, BLUE, CYAN, NC (read)
#          RESULT_FERP_TIMES, RESULT_GREP_TIMES (read; the two entries
#          appended by the benchmark_pattern calls below)
run_scaling_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Scaling Tests ===${NC}"

  echo -e " ${CYAN}Small file (~700KB):${NC}"
  benchmark_pattern " [a-z]+ on small" "$BENCH_DIR/english_small.txt" "-E" "-E" "[a-z]+"

  echo -e " ${CYAN}Large file (~70MB):${NC}"
  benchmark_pattern " [a-z]+ on large" "$BENCH_DIR/english_large.txt" "-E" "-E" "[a-z]+"

  # The last two stored results are the small and large runs above. Leave
  # the values empty (-> N/A) rather than index out of range if they are
  # missing.
  local num_results=${#RESULT_FERP_TIMES[@]}
  local small_ferp="" large_ferp="" small_grep="" large_grep=""
  if ((num_results >= 2)); then
    small_ferp="${RESULT_FERP_TIMES[$((num_results - 2))]}"
    large_ferp="${RESULT_FERP_TIMES[$((num_results - 1))]}"
    small_grep="${RESULT_GREP_TIMES[$((num_results - 2))]}"
    large_grep="${RESULT_GREP_TIMES[$((num_results - 1))]}"
  fi

  echo -e "\n ${CYAN}Scaling (large/small ratio):${NC}"
  local ferp_scale grep_scale
  ferp_scale=$(_scaling_ratio "$large_ferp" "$small_ferp")
  grep_scale=$(_scaling_ratio "$large_grep" "$small_grep")
  echo -e " ferp: ${ferp_scale}x grep: ${grep_scale}x (lower is better for large files)"
}
341
342	#------------------------------------------------------------------------------
343	# Report Generation
344	#------------------------------------------------------------------------------
345
# Print the final report: win counts, total times, the overall grep/ferp
# time ratio and basic system information.
#
# Globals: RESULT_NAMES, RESULT_FERP_TIMES, RESULT_GREP_TIMES (read)
#          FERP, GREP, RUNS (read)
#          BOLD, BLUE, CYAN, GREEN, RED, NC (read)
# Outputs: the report on stdout
print_summary() {
  local rule="══════════════════════════════════════════════════════════════"
  echo -e "\n${BOLD}${BLUE}${rule}${NC}"
  echo -e "${BOLD}${BLUE} SUMMARY ${NC}"
  echo -e "${BOLD}${BLUE}${rule}${NC}"

  local count=${#RESULT_NAMES[@]}
  local idx stats total_ferp total_grep wins_ferp wins_grep avg_speedup

  # One awk pass totals both columns and counts wins. A tie counts for grep.
  # The ratio of the totals is reported as N/A when ferp's total is not
  # positive (e.g. no results) instead of dividing by zero.
  stats=$(
    for idx in "${!RESULT_NAMES[@]}"; do
      printf '%s\t%s\n' "${RESULT_FERP_TIMES[$idx]}" "${RESULT_GREP_TIMES[$idx]}"
    done | awk -F '\t' '
      { tf += $1; tg += $2; if ($1 + 0 < $2 + 0) fw++; else gw++ }
      END {
        ratio = (tf > 0) ? sprintf("%.2f", tg / tf) : "N/A"
        printf "%.6f %.6f %d %d %s\n", tf, tg, fw, gw, ratio
      }'
  )
  read -r total_ferp total_grep wins_ferp wins_grep avg_speedup <<<"$stats"

  echo -e "\n${CYAN}Overall Statistics:${NC}"
  echo -e " Total benchmarks run: $count"
  echo -e " ferp wins: ${GREEN}$wins_ferp${NC}"
  echo -e " grep wins: ${RED}$wins_grep${NC}"
  printf " Total time - ferp: %.3fs grep: %.3fs\n" "$total_ferp" "$total_grep"
  echo -e " ${BOLD}Average speedup: ${GREEN}${avg_speedup}x${NC}"

  # A pipeline's status is that of its last stage, so "|| echo Unknown"
  # after xargs/head never fires; fall back on an empty result instead.
  local cpu ferp_version grep_version
  cpu=$(sysctl -n machdep.cpu.brand_string 2>/dev/null ||
    lscpu 2>/dev/null | grep 'Model name' | cut -d: -f2 | xargs) || true
  # FERP and GREP are left unquoted, matching how benchmark_pattern expands
  # them into a command line.
  ferp_version=$($FERP --version 2>&1 | head -1) || true
  grep_version=$($GREP --version 2>&1 | head -1) || true

  echo -e "\n${CYAN}System Information:${NC}"
  echo -e " OS: $(uname -s) $(uname -r)"
  echo -e " CPU: ${cpu:-Unknown}"
  echo -e " ferp version: ${ferp_version:-Unknown}"
  echo -e " grep version: ${grep_version:-Unknown}"
  echo -e " Runs per benchmark: $RUNS (median taken)"

  echo -e "\n${BOLD}${BLUE}${rule}${NC}"
}
388
389	#------------------------------------------------------------------------------
390	# Main
391	#------------------------------------------------------------------------------
392
# Entry point: print the banner, verify prerequisites, generate the test
# files, run every benchmark suite in order and finish with the summary.
main() {
  echo -e "${BOLD}${BLUE}"
  printf '%s\n' \
    "╔══════════════════════════════════════════════════════════════╗" \
    "║ FERP vs grep Benchmark Suite ║" \
    "║ Comprehensive Performance Comparison ║" \
    "╚══════════════════════════════════════════════════════════════╝"
  echo -e "${NC}"

  check_prerequisites
  create_test_files

  echo -e "\n${BOLD}${CYAN}Running benchmarks (${RUNS} runs each, reporting median)...${NC}"
  echo -e "${CYAN}Format: ferp time | grep time | speedup (>1 = ferp faster)${NC}\n"

  # Suites run in this fixed order; each name maps to run_<name>_benchmarks.
  local suite
  for suite in literal regex anchor log code csv special scaling; do
    "run_${suite}_benchmarks"
  done

  print_summary

  echo -e "\n${GREEN}Benchmark complete!${NC}"
}
420
# Print command-line usage on stdout.
_print_usage() {
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Options:"
  echo " -r, --runs N Number of runs per benchmark (default: 3)"
  echo " -q, --quick Quick mode (smaller files, fewer runs)"
  echo " -h, --help Show this help"
}

# Apply command-line options to the benchmark configuration.
#
# Every argument is examined, so options can be combined (e.g. "-q -r 5");
# later options override earlier ones.
#
# Globals:   RUNS, SMALL_LINES, MEDIUM_LINES, LARGE_LINES (written)
#            YELLOW, NC (read)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help; 2 on an invalid run count
parse_args() {
  local value
  while (($# > 0)); do
    case "$1" in
      -h | --help)
        _print_usage
        exit 0
        ;;
      -q | --quick)
        RUNS=1
        SMALL_LINES=1000
        MEDIUM_LINES=10000
        LARGE_LINES=100000
        echo -e "${YELLOW}Quick mode: reduced file sizes and single run${NC}"
        ;;
      -r | --runs)
        value="${2:-}"
        if (($# >= 2)); then
          shift
        fi
        if [[ -z "$value" ]]; then
          RUNS=3 # no value given: keep the documented default
        elif [[ "$value" =~ ^[1-9][0-9]*$ ]]; then
          RUNS="$value"
        else
          # Zero or non-numeric counts would leave the timers with nothing
          # to take a median of.
          printf '%s: invalid run count: %s\n' "$0" "$value" >&2
          exit 2
        fi
        ;;
      *)
        printf '%s: ignoring unknown argument: %s\n' "$0" "$1" >&2
        ;;
    esac
    shift
  done
}

# Run with optional arguments
parse_args "$@"

main
444	main