#!/usr/bin/env bash
#
# FERP vs grep Benchmark Suite
# Comprehensive performance comparison
#
# Usage: ./benchmark.sh [-q|--quick] [-r|--runs N] [-h|--help]
#
# Requires: bash 4+, bc
#           (python3 or a `date` supporting %N is only needed for timing on
#            bash < 5, where the EPOCHREALTIME variable is unavailable)
#

set -e

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly NC='\033[0m' # No Color

# Configuration
BENCH_DIR="" # Created with mktemp by create_test_files; removed by cleanup
FERP="./ferp"
GREP="grep"
RUNS=3 # Number of runs per benchmark (take median)

# Test file sizes
SMALL_LINES=10000    # ~700KB
MEDIUM_LINES=100000  # ~7MB
LARGE_LINES=1000000  # ~70MB

# Results storage (simple parallel arrays for portability)
RESULT_NAMES=()
RESULT_FERP_TIMES=()
RESULT_GREP_TIMES=()

#------------------------------------------------------------------------------
# Utility Functions
#------------------------------------------------------------------------------

# Remove the benchmark directory. Installed as the EXIT trap, so it runs on
# every exit path; it is a silent no-op until a directory has been created.
cleanup() {
  [[ -n "$BENCH_DIR" ]] || return 0
  echo -e "\n${CYAN}Cleaning up...${NC}"
  rm -rf -- "$BENCH_DIR"
}
trap cleanup EXIT

# Print an error message ($1) to stderr and exit with status 1.
die() {
  echo -e "${RED}ERROR: $1${NC}" >&2
  exit 1
}

# Print the help text.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Options:"
  echo " -r, --runs N Number of runs per benchmark (default: 3)"
  echo " -q, --quick Quick mode (smaller files, fewer runs)"
  echo " -h, --help Show this help"
}

# Print $1 / $2 computed by bc at scale $3 (default 2), or "N/A" when either
# operand is not a non-negative decimal number or the divisor is zero.
# bc exits 0 on division by zero, so the check has to be made up front.
safe_ratio() {
  local numerator="$1" denominator="$2" scale="${3:-2}"
  local number_re='^[0-9]*\.?[0-9]+$'
  local quotient=""

  if [[ ! "$numerator" =~ $number_re || ! "$denominator" =~ $number_re ]]; then
    echo "N/A"
    return 0
  fi
  if (( $(echo "$denominator == 0" | bc -l) )); then
    echo "N/A"
    return 0
  fi
  quotient=$(echo "scale=$scale; $numerator / $denominator" | bc -l 2>/dev/null) || quotient=""
  echo "${quotient:-N/A}"
}

# Store the current wall-clock time (fractional seconds) in the variable
# named by $1. EPOCHREALTIME (bash 5+) is read without spawning a process, so
# no interpreter start-up cost leaks into the measured interval; older shells
# fall back to python3, then to `date +%s.%N`.
read_clock() {
  local __stamp
  if [[ -n "${EPOCHREALTIME:-}" ]]; then
    # Some locales render the decimal separator as a comma; bc needs a dot.
    __stamp=${EPOCHREALTIME/,/.}
  else
    __stamp=$(python3 -c 'import time; print(time.time())' 2>/dev/null || date +%s.%N)
  fi
  printf -v "$1" '%s' "$__stamp"
}

# Verify that ferp (building it if necessary), grep and bc are usable.
check_prerequisites() {
  echo -e "${CYAN}Checking prerequisites...${NC}"

  # Check ferp exists
  if [[ ! -x "$FERP" ]]; then
    echo -e "${YELLOW}Building ferp (release mode)...${NC}"
    make release >/dev/null 2>&1 || die "Failed to build ferp"
  fi

  # Verify ferp works ($FERP is deliberately unquoted so it may carry flags)
  echo "test" | $FERP "test" >/dev/null 2>&1 || die "ferp not working"

  # Check grep exists (only the command word, in case GREP carries flags)
  command -v "${GREP%% *}" >/dev/null 2>&1 || die "grep not found"

  # Every time computation below goes through bc
  command -v bc >/dev/null 2>&1 || die "bc not found"

  echo -e "${GREEN}Prerequisites OK${NC}"
}

# Create a private benchmark directory and generate the test corpora in it.
create_test_files() {
  BENCH_DIR=$(mktemp -d /tmp/ferp_benchmark.XXXXXX) \
    || die "Failed to create benchmark directory"
  echo -e "\n${CYAN}Creating test files in $BENCH_DIR...${NC}"

  # File 1: English-like text (varied content) - use awk for speed
  echo -e " Creating english text file ($LARGE_LINES lines)..."
  awk -v n="$LARGE_LINES" 'BEGIN {
    lines[0] = "The quick brown fox jumps over the lazy dog near the riverbank."
    lines[1] = "Hello world, this is line number %d of the benchmark test file."
    lines[2] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit sed do."
    lines[3] = "Error: connection timeout after 30000ms on server node-%d."
    lines[4] = "DEBUG [2024-01-15 10:23:45] Processing request id=%d status=pending"
    lines[5] = "user@example.com logged in from 192.168.1.%d at 12:00:00"
    lines[6] = "WARNING: disk usage at %d%% on /dev/sda1 partition"
    lines[7] = "Function calculate_total(items=[1,2,3]) returned value=%d"
    lines[8] = "The API endpoint /api/v2/users/%d responded with HTTP 200 OK"
    lines[9] = "Configuration: max_threads=16, timeout=5000, retry_count=3"
    for (i = 1; i <= n; i++) {
      idx = i % 10
      if (idx == 0 || idx == 2 || idx == 9) {
        print lines[idx]
      } else if (idx == 3) {
        printf lines[idx] "\n", i % 100
      } else if (idx == 5) {
        printf lines[idx] "\n", i % 256
      } else if (idx == 6) {
        printf lines[idx] "\n", 50 + (i % 50)
      } else if (idx == 7) {
        printf lines[idx] "\n", i * 42
      } else {
        printf lines[idx] "\n", i
      }
    }
  }' > "$BENCH_DIR/english_large.txt"

  # File 2: Log-like file (structured)
  echo -e " Creating log file ($MEDIUM_LINES lines)..."
  awk -v n="$MEDIUM_LINES" 'BEGIN {
    levels[0] = "INFO"; levels[1] = "DEBUG"; levels[2] = "WARN"; levels[3] = "ERROR"
    for (i = 1; i <= n; i++) {
      day = 1 + (i % 28)
      hour = i % 24
      min = i % 60
      sec = i % 60
      comp = i % 20
      printf "[2024-01-%02d %02d:%02d:%02d] %s: Message number %d from component-%d\n",
        day, hour, min, sec, levels[i % 4], i, comp
    }
  }' > "$BENCH_DIR/logs_medium.txt"

  # File 3: Code-like file
  echo -e " Creating code file ($MEDIUM_LINES lines)..."
  awk -v n="$MEDIUM_LINES" 'BEGIN {
    for (i = 1; i <= n; i++) {
      idx = i % 8
      if (idx == 0) printf "function process_data_%d(input) {\n", i
      else if (idx == 1) print " const result = input.map(x => x * 2);"
      else if (idx == 2) print " if (result.length > 0) {"
      else if (idx == 3) print " console.log(\"Processing:\", result);"
      else if (idx == 4) print " return result.filter(x => x > 10);"
      else if (idx == 5) print " }"
      else if (idx == 6) print " return [];"
      else print "}"
    }
  }' > "$BENCH_DIR/code_medium.txt"

  # File 4: CSV-like data (scores are random, so content differs per run)
  echo -e " Creating CSV file ($MEDIUM_LINES lines)..."
  awk -v n="$MEDIUM_LINES" 'BEGIN {
    print "id,name,email,score,timestamp"
    srand()
    for (i = 1; i <= n; i++) {
      score = int(rand() * 100)
      printf "%d,user_%d,user%d@domain%d.com,%d,%d\n",
        i, i, i, i % 100, score, 1700000000 + i
    }
  }' > "$BENCH_DIR/data_medium.csv"

  # File 5: Small file for quick tests
  echo -e " Creating small file ($SMALL_LINES lines)..."
  head -n "$SMALL_LINES" "$BENCH_DIR/english_large.txt" > "$BENCH_DIR/english_small.txt"

  # Print file sizes. The glob yields the paths; only the size column is
  # taken from ls, so a path can never shift the parsed field.
  echo -e "\n${CYAN}Test files created:${NC}"
  local path
  for path in "$BENCH_DIR"/*; do
    case "$path" in
      *.txt | *.csv) ;;
      *) continue ;;
    esac
    printf ' %s: %s\n' "$path" "$(ls -lh "$path" 2>/dev/null | awk '{print $5}')"
  done
}

#------------------------------------------------------------------------------
# Benchmark Functions
#------------------------------------------------------------------------------

# Run command string $1 RUNS times and print the median wall-clock time in
# seconds, measured with the shell's `time` keyword. A run whose timing output
# cannot be parsed is recorded as 999.
run_timed() {
  local cmd="$1"
  local -a times=()
  local run t mins secs

  for ((run = 0; run < RUNS; run++)); do
    # `time` reports on the stderr of the brace group; keep the "real" line.
    t=$( { time eval "$cmd" >/dev/null 2>&1; } 2>&1 | grep real | sed 's/real[[:space:]]*//' )

    # Convert to seconds (handles both 0m0.123s and 0.123 formats)
    if [[ "$t" =~ ([0-9]+)m([0-9.]+)s ]]; then
      mins="${BASH_REMATCH[1]}"
      secs="${BASH_REMATCH[2]}"
      t=$(echo "$mins * 60 + $secs" | bc -l)
    elif [[ "$t" =~ ^[0-9.]+$ ]]; then
      : # already in seconds
    else
      t="999" # Error case
    fi
    times+=("$t")
  done

  # Return median (sort and take middle; lower middle for an even RUNS)
  printf '%s\n' "${times[@]}" | sort -n | sed -n "$(((RUNS + 1) / 2))p"
}

# Run command string $1 RUNS times and print the median wall-clock time in
# seconds, measured by reading the clock before and after each run.
run_timed_portable() {
  local cmd="$1"
  local -a times=()
  local run start end

  for ((run = 0; run < RUNS; run++)); do
    read_clock start
    # A non-zero status (e.g. grep finding no match) is an expected outcome of
    # a benchmark run and must not abort the script under `set -e`.
    eval "$cmd" >/dev/null 2>&1 || true
    read_clock end
    times+=("$(echo "$end - $start" | bc -l)")
  done

  # Return median
  printf '%s\n' "${times[@]}" | sort -n | sed -n "$(((RUNS + 1) / 2))p"
}

# Time ferp and grep on one pattern, print a result row and record the times.
# Arguments:
#   $1 - display name of the benchmark
#   $2 - file to search
#   $3 - extra arguments for ferp (word-split; may be empty)
#   $4 - extra arguments for grep (word-split; may be empty)
#   $5 - search pattern
# Globals: appends to RESULT_NAMES, RESULT_FERP_TIMES, RESULT_GREP_TIMES.
benchmark_pattern() {
  local name="$1"
  local file="$2"
  local ferp_args="$3"
  local grep_args="$4"
  local pattern="$5"
  local ferp_cmd grep_cmd ferp_time grep_time speedup
  local color="$NC"

  printf " %-35s" "$name"

  # %q shell-quotes pattern and file so the eval'd command sees each as
  # exactly one word, whatever characters they contain.
  printf -v ferp_cmd '%s %s %q %q' "$FERP" "$ferp_args" "$pattern" "$file"
  printf -v grep_cmd '%s %s %q %q' "$GREP" "$grep_args" "$pattern" "$file"

  # Run ferp
  ferp_time=$(run_timed_portable "$ferp_cmd")

  # Run grep
  grep_time=$(run_timed_portable "$grep_cmd")

  # Calculate speedup
  speedup=$(safe_ratio "$grep_time" "$ferp_time" 2)

  # Store results
  RESULT_NAMES+=("$name")
  RESULT_FERP_TIMES+=("$ferp_time")
  RESULT_GREP_TIMES+=("$grep_time")

  if [[ "$speedup" == "N/A" ]]; then
    printf "ferp: %6.3fs grep: %6.3fs %5s\n" "$ferp_time" "$grep_time" "$speedup"
    return 0
  fi

  # Color-code the speedup
  if (( $(echo "$speedup > 1.5" | bc -l) )); then
    color="$GREEN"
  elif (( $(echo "$speedup < 0.8" | bc -l) )); then
    color="$RED"
  fi

  printf "ferp: %6.3fs grep: %6.3fs ${color}%5.2fx${NC}\n" "$ferp_time" "$grep_time" "$speedup"
}

#------------------------------------------------------------------------------
# Benchmark Suites
#------------------------------------------------------------------------------

run_literal_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Literal String Matching ===${NC}"
  local file="$BENCH_DIR/english_large.txt"

  benchmark_pattern "Simple word (hello)" "$file" "" "" "hello"
  benchmark_pattern "Common word (the)" "$file" "" "" "the"
  benchmark_pattern "Longer phrase (quick brown)" "$file" "" "" "quick brown"
  benchmark_pattern "Case insensitive (-i hello)" "$file" "-i" "-i" "hello"
  benchmark_pattern "Fixed string (-F hello)" "$file" "-F" "-F" "hello"
  benchmark_pattern "Word boundary (-w the)" "$file" "-w" "-w" "the"
}

run_regex_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Regular Expression Matching ===${NC}"
  local file="$BENCH_DIR/english_large.txt"

  benchmark_pattern "Dot wildcard (h.llo)" "$file" "" "" "h.llo"
  benchmark_pattern "Star quantifier (hel*o)" "$file" "" "" "hel*o"
  benchmark_pattern "Character class ([a-z]+)" "$file" "-E" "-E" "[a-z]+"
  benchmark_pattern "Mixed class ([a-zA-Z0-9]+)" "$file" "-E" "-E" "[a-zA-Z0-9]+"
  benchmark_pattern "Digit class ([0-9]+)" "$file" "-E" "-E" "[0-9]+"
  benchmark_pattern "Alternation (cat|dog|fox)" "$file" "-E" "-E" "cat|dog|fox"
  benchmark_pattern "Optional (colou?r)" "$file" "-E" "-E" "colou?r"
  benchmark_pattern "One or more (hel+o)" "$file" "-E" "-E" "hel+o"
}

run_anchor_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Anchor Patterns ===${NC}"
  local file="$BENCH_DIR/english_large.txt"

  benchmark_pattern "Start anchor (^The)" "$file" "" "" '^The'
  benchmark_pattern 'End anchor (\.$)' "$file" "" "" '\.$'
  benchmark_pattern 'Both anchors (^The.*dog$)' "$file" "-E" "-E" '^The.*dog$'
  # Each label names the anchor its pattern actually exercises:
  # \< is start-of-word, \> is end-of-word.
  benchmark_pattern 'Word start (\<fox)' "$file" "" "" '\<fox'
  benchmark_pattern 'Word end (fox\>)' "$file" "" "" 'fox\>'
}

run_log_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Log File Patterns ===${NC}"
  local file="$BENCH_DIR/logs_medium.txt"

  benchmark_pattern "Log level (ERROR)" "$file" "" "" "ERROR"
  benchmark_pattern "Log level (-i warn)" "$file" "-i" "-i" "warn"
  benchmark_pattern "Timestamp pattern ([0-9]{2}:[0-9]{2})" "$file" "-E" "-E" "[0-9]{2}:[0-9]{2}"
  benchmark_pattern "Component (component-[0-9]+)" "$file" "-E" "-E" "component-[0-9]+"
  benchmark_pattern "Multiple levels (ERROR|WARN)" "$file" "-E" "-E" "ERROR|WARN"
}

run_code_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Code Pattern Matching ===${NC}"
  local file="$BENCH_DIR/code_medium.txt"

  benchmark_pattern "Function name (function)" "$file" "" "" "function"
  benchmark_pattern "Variable (const|let|var)" "$file" "-E" "-E" "const|let|var"
  benchmark_pattern "Return statement (return)" "$file" "" "" "return"
  benchmark_pattern 'Console log (console\.log)' "$file" "-E" "-E" 'console\.log'
}

run_csv_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== CSV/Data Pattern Matching ===${NC}"
  local file="$BENCH_DIR/data_medium.csv"

  benchmark_pattern 'Email pattern (@.*\.com)' "$file" "-E" "-E" '@.*\.com'
  benchmark_pattern "Specific domain (domain50)" "$file" "" "" "domain50"
  benchmark_pattern "User pattern (user_[0-9]+)" "$file" "-E" "-E" "user_[0-9]+"
  benchmark_pattern "High score (,[89][0-9],)" "$file" "-E" "-E" ",[89][0-9],"
}

run_special_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Special Cases ===${NC}"
  local file="$BENCH_DIR/english_large.txt"

  benchmark_pattern "Invert match (-v error)" "$file" "-v" "-v" "error"
  benchmark_pattern "Count only (-c the)" "$file" "-c" "-c" "the"
  benchmark_pattern "Line number (-n hello)" "$file" "-n" "-n" "hello"
  benchmark_pattern "Multiple patterns (cat|dog|bird|fish)" "$file" "-E" "-E" "cat|dog|bird|fish"
  benchmark_pattern "Long alternation (the|and|for|with|from)" "$file" "-E" "-E" "the|and|for|with|from"
}

# Run the same pattern on the small and the large file and report how each
# tool's time grows between them.
run_scaling_benchmarks() {
  echo -e "\n${BOLD}${BLUE}=== Scaling Tests ===${NC}"

  echo -e " ${CYAN}Small file (~700KB):${NC}"
  benchmark_pattern " [a-z]+ on small" "$BENCH_DIR/english_small.txt" "-E" "-E" "[a-z]+"

  echo -e " ${CYAN}Large file (~70MB):${NC}"
  benchmark_pattern " [a-z]+ on large" "$BENCH_DIR/english_large.txt" "-E" "-E" "[a-z]+"

  # Calculate scaling factor (the two results just appended above)
  local num_results=${#RESULT_FERP_TIMES[@]}
  local small_ferp="${RESULT_FERP_TIMES[$((num_results - 2))]}"
  local large_ferp="${RESULT_FERP_TIMES[$((num_results - 1))]}"
  local small_grep="${RESULT_GREP_TIMES[$((num_results - 2))]}"
  local large_grep="${RESULT_GREP_TIMES[$((num_results - 1))]}"
  local ferp_scale grep_scale

  echo -e "\n ${CYAN}Scaling (large/small ratio):${NC}"
  ferp_scale=$(safe_ratio "$large_ferp" "$small_ferp" 1)
  grep_scale=$(safe_ratio "$large_grep" "$small_grep" 1)
  # Only a real ratio gets the "x" suffix.
  [[ "$ferp_scale" == "N/A" ]] || ferp_scale="${ferp_scale}x"
  [[ "$grep_scale" == "N/A" ]] || grep_scale="${grep_scale}x"
  echo -e " ferp: ${ferp_scale} grep: ${grep_scale} (lower is better for large files)"
}

#------------------------------------------------------------------------------
# Report Generation
#------------------------------------------------------------------------------

# Print the CPU model: sysctl first, then lscpu, else "Unknown".
describe_cpu() {
  local model=""
  model=$(sysctl -n machdep.cpu.brand_string 2>/dev/null) || model=""
  if [[ -z "$model" ]]; then
    model=$(lscpu 2>/dev/null | grep 'Model name' | cut -d: -f2 | xargs) || model=""
  fi
  echo "${model:-Unknown}"
}

# Print the first line of `$1 --version`, or "Unknown" if it prints nothing.
# $1 is deliberately unquoted so the tool variable may carry flags.
tool_version() {
  local first_line=""
  first_line=$($1 --version 2>&1 | head -1) || first_line=""
  echo "${first_line:-Unknown}"
}

# Print totals, win counts, the overall speedup and system information.
print_summary() {
  echo -e "\n${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"
  echo -e "${BOLD}${BLUE} SUMMARY ${NC}"
  echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"

  local total_ferp=0
  local total_grep=0
  local wins_ferp=0
  local wins_grep=0
  local count=${#RESULT_NAMES[@]}
  local i ft gt avg_speedup

  for i in "${!RESULT_NAMES[@]}"; do
    ft="${RESULT_FERP_TIMES[$i]}"
    gt="${RESULT_GREP_TIMES[$i]}"
    total_ferp=$(echo "$total_ferp + $ft" | bc -l)
    total_grep=$(echo "$total_grep + $gt" | bc -l)

    # A tie is counted as a grep win.
    if (( $(echo "$ft < $gt" | bc -l) )); then
      wins_ferp=$((wins_ferp + 1))
    else
      wins_grep=$((wins_grep + 1))
    fi
  done

  avg_speedup=$(safe_ratio "$total_grep" "$total_ferp" 2)
  [[ "$avg_speedup" == "N/A" ]] || avg_speedup="${avg_speedup}x"

  echo -e "\n${CYAN}Overall Statistics:${NC}"
  echo -e " Total benchmarks run: $count"
  echo -e " ferp wins: ${GREEN}$wins_ferp${NC}"
  echo -e " grep wins: ${RED}$wins_grep${NC}"
  printf " Total time - ferp: %.3fs grep: %.3fs\n" "$total_ferp" "$total_grep"
  echo -e " ${BOLD}Average speedup: ${GREEN}${avg_speedup}${NC}"

  echo -e "\n${CYAN}System Information:${NC}"
  echo -e " OS: $(uname -s) $(uname -r)"
  echo -e " CPU: $(describe_cpu)"
  echo -e " ferp version: $(tool_version "$FERP")"
  echo -e " grep version: $(tool_version "$GREP")"
  echo -e " Runs per benchmark: $RUNS (median taken)"

  echo -e "\n${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"
}

#------------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------------

main() {
  echo -e "${BOLD}${BLUE}"
  echo "╔══════════════════════════════════════════════════════════════╗"
  echo "║ FERP vs grep Benchmark Suite ║"
  echo "║ Comprehensive Performance Comparison ║"
  echo "╚══════════════════════════════════════════════════════════════╝"
  echo -e "${NC}"

  check_prerequisites
  create_test_files

  echo -e "\n${BOLD}${CYAN}Running benchmarks (${RUNS} runs each, reporting median)...${NC}"
  echo -e "${CYAN}Format: ferp time | grep time | speedup (>1 = ferp faster)${NC}\n"

  run_literal_benchmarks
  run_regex_benchmarks
  run_anchor_benchmarks
  run_log_benchmarks
  run_code_benchmarks
  run_csv_benchmarks
  run_special_benchmarks
  run_scaling_benchmarks

  print_summary

  echo -e "\n${GREEN}Benchmark complete!${NC}"
}

# Parse command-line options. Options may be combined in any order; an
# explicit --runs always wins over the run count implied by --quick.
parse_args() {
  local quick=0
  local runs=""

  while (( $# > 0 )); do
    case "$1" in
      -h | --help)
        usage
        exit 0
        ;;
      -q | --quick)
        quick=1
        ;;
      -r | --runs)
        # A missing value falls back to the default of 3.
        runs="${2:-3}"
        [[ "$runs" =~ ^[1-9][0-9]*$ ]] \
          || die "--runs expects a positive integer, got '$runs'"
        if (( $# >= 2 )); then
          shift
        fi
        ;;
      *)
        die "Unknown option: $1 (use --help)"
        ;;
    esac
    shift
  done

  if (( quick )); then
    RUNS=1
    SMALL_LINES=1000
    MEDIUM_LINES=10000
    LARGE_LINES=100000
    echo -e "${YELLOW}Quick mode: reduced file sizes and single run${NC}"
  fi
  if [[ -n "$runs" ]]; then
    RUNS="$runs"
  fi
}

parse_args "$@"
main