Bash · 17232 bytes Raw Blame History
1 #!/usr/bin/env bash
2 #
3 # FERP vs grep Benchmark Suite
4 # Comprehensive performance comparison
5 #
6 # Requires: bash 4+, bc, python3 (for timing)
7 #
8
# Abort as soon as a command fails outside of a tested context.
set -e

# Colors for output.
# Stored as literal backslash sequences; they turn into real escape codes
# only when printed through `echo -e` or used inside a printf format string.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# Configuration
# Scratch directory that receives the generated test files and is removed
# with `rm -rf` on exit. It is created with mktemp so that the name is not
# predictable and the directory is freshly created by this process; a fixed
# /tmp/ferp_benchmark_<pid> path could be prepared in advance by another
# local user.
BENCH_DIR="$(mktemp -d "${TMPDIR:-/tmp}/ferp_benchmark.XXXXXX")" || {
  echo "ERROR: could not create temporary benchmark directory" >&2
  exit 1
}
FERP="./ferp"
GREP="grep"
RUNS=3 # Number of runs per benchmark (take median)

# Test file sizes (line counts)
SMALL_LINES=10000 # ~700KB
MEDIUM_LINES=100000 # ~7MB
LARGE_LINES=1000000 # ~70MB

# Results storage (simple arrays for portability).
# The three arrays are parallel: entry k of each describes the same benchmark.
RESULT_NAMES=()
RESULT_FERP_TIMES=()
RESULT_GREP_TIMES=()
36 #------------------------------------------------------------------------------
37 # Utility Functions
38 #------------------------------------------------------------------------------
39
# Announce the cleanup and delete the benchmark scratch directory.
# Globals: BENCH_DIR (read), CYAN, NC (read)
cleanup() {
  printf '%b\n' "\n${CYAN}Cleaning up...${NC}"
  rm -rf "$BENCH_DIR"
}

# Run cleanup whenever the script exits, whatever the exit path.
trap cleanup EXIT
46
# Print an error message to stderr and terminate the script.
# Arguments: $1 - message text (printed after the "ERROR: " prefix)
# Globals:   RED, NC (read)
# Returns:   never; exits with status 1
die() {
  printf '%b\n' "${RED}ERROR: $1${NC}" >&2
  exit 1
}
51
# Verify that everything the benchmark needs is available.
#
# Builds ferp with `make release` when $FERP is not an executable file, runs
# a one-line smoke test through it, and checks that grep and bc can be found.
#
# Globals: FERP, GREP (read); CYAN, YELLOW, GREEN, NC (read)
# Returns: 0 when all checks pass; otherwise exits through die
check_prerequisites() {
  echo -e "${CYAN}Checking prerequisites...${NC}"

  # Check ferp exists
  if [[ ! -x "$FERP" ]]; then
    echo -e "${YELLOW}Building ferp (release mode)...${NC}"
    make release >/dev/null 2>&1 || die "Failed to build ferp"
  fi

  # Verify ferp works. $FERP is quoted so a path containing spaces is
  # executed as one word instead of being split into command + arguments.
  echo "test" | "$FERP" "test" >/dev/null 2>&1 || die "ferp not working"

  # Check grep exists
  command -v "$GREP" >/dev/null 2>&1 || die "grep not found"

  # bc performs the floating point arithmetic on the measured times; without
  # it the reported numbers would be empty rather than an obvious failure.
  command -v bc >/dev/null 2>&1 || die "bc not found"

  echo -e "${GREEN}Prerequisites OK${NC}"
}
69
# Generate the benchmark input files inside $BENCH_DIR.
#
# Files written:
#   english_large.txt  LARGE_LINES lines cycling through ten sentence templates
#   logs_medium.txt    MEDIUM_LINES timestamped log lines with rotating levels
#   code_medium.txt    MEDIUM_LINES lines cycling through an 8-line JS snippet
#   data_medium.csv    header row plus MEDIUM_LINES data rows
#   english_small.txt  first SMALL_LINES lines of english_large.txt
#
# Globals: BENCH_DIR, SMALL_LINES, MEDIUM_LINES, LARGE_LINES (read);
#          CYAN, NC (read)
# Outputs: progress messages and a name/size listing on stdout
create_test_files() {
  local path size

  echo -e "\n${CYAN}Creating test files in $BENCH_DIR...${NC}"
  mkdir -p "$BENCH_DIR"

  # File 1: English-like text (varied content) - use awk for speed
  echo -e " Creating english text file ($LARGE_LINES lines)..."
  awk -v n="$LARGE_LINES" 'BEGIN {
    lines[0] = "The quick brown fox jumps over the lazy dog near the riverbank."
    lines[1] = "Hello world, this is line number %d of the benchmark test file."
    lines[2] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit sed do."
    lines[3] = "Error: connection timeout after 30000ms on server node-%d."
    lines[4] = "DEBUG [2024-01-15 10:23:45] Processing request id=%d status=pending"
    lines[5] = "user@example.com logged in from 192.168.1.%d at 12:00:00"
    lines[6] = "WARNING: disk usage at %d%% on /dev/sda1 partition"
    lines[7] = "Function calculate_total(items=[1,2,3]) returned value=%d"
    lines[8] = "The API endpoint /api/v2/users/%d responded with HTTP 200 OK"
    lines[9] = "Configuration: max_threads=16, timeout=5000, retry_count=3"
    for (i = 1; i <= n; i++) {
      idx = i % 10
      # Templates 0, 2 and 9 have no placeholder and are printed verbatim;
      # the others are printf formats fed a value derived from the line number.
      if (idx == 0 || idx == 2 || idx == 9) {
        print lines[idx]
      } else if (idx == 3) {
        printf lines[idx] "\n", i % 100
      } else if (idx == 5) {
        printf lines[idx] "\n", i % 256
      } else if (idx == 6) {
        printf lines[idx] "\n", 50 + (i % 50)
      } else if (idx == 7) {
        printf lines[idx] "\n", i * 42
      } else {
        printf lines[idx] "\n", i
      }
    }
  }' > "$BENCH_DIR/english_large.txt"

  # File 2: Log-like file (structured)
  echo -e " Creating log file ($MEDIUM_LINES lines)..."
  awk -v n="$MEDIUM_LINES" 'BEGIN {
    levels[0] = "INFO"; levels[1] = "DEBUG"; levels[2] = "WARN"; levels[3] = "ERROR"
    for (i = 1; i <= n; i++) {
      day = 1 + (i % 28)
      hour = i % 24
      min = i % 60
      sec = i % 60
      comp = i % 20
      printf "[2024-01-%02d %02d:%02d:%02d] %s: Message number %d from component-%d\n", \
        day, hour, min, sec, levels[i % 4], i, comp
    }
  }' > "$BENCH_DIR/logs_medium.txt"

  # File 3: Code-like file
  echo -e " Creating code file ($MEDIUM_LINES lines)..."
  awk -v n="$MEDIUM_LINES" 'BEGIN {
    for (i = 1; i <= n; i++) {
      idx = i % 8
      if (idx == 0) printf "function process_data_%d(input) {\n", i
      else if (idx == 1) print " const result = input.map(x => x * 2);"
      else if (idx == 2) print " if (result.length > 0) {"
      else if (idx == 3) print " console.log(\"Processing:\", result);"
      else if (idx == 4) print " return result.filter(x => x > 10);"
      else if (idx == 5) print " }"
      else if (idx == 6) print " return [];"
      else print "}"
    }
  }' > "$BENCH_DIR/code_medium.txt"

  # File 4: CSV-like data
  echo -e " Creating CSV file ($MEDIUM_LINES lines)..."
  awk -v n="$MEDIUM_LINES" 'BEGIN {
    print "id,name,email,score,timestamp"
    # Fixed seed: the score column is pseudo-random but identical on every
    # invocation, so benchmark runs search the same data and stay comparable.
    srand(42)
    for (i = 1; i <= n; i++) {
      score = int(rand() * 100)
      printf "%d,user_%d,user%d@domain%d.com,%d,%d\n", \
        i, i, i, i % 100, score, 1700000000 + i
    }
  }' > "$BENCH_DIR/data_medium.csv"

  # File 5: Small file for quick tests
  echo -e " Creating small file ($SMALL_LINES lines)..."
  head -n "$SMALL_LINES" "$BENCH_DIR/english_large.txt" > "$BENCH_DIR/english_small.txt"

  # Print file sizes. Only the size column (field 5) is taken from ls; the
  # name comes from the glob, so a directory path containing spaces is
  # still reported correctly.
  echo -e "\n${CYAN}Test files created:${NC}"
  for path in "$BENCH_DIR"/*; do
    case "$path" in
      *.txt | *.csv) ;;
      *) continue ;;
    esac
    size=$(ls -lh -- "$path" 2>/dev/null | awk '{print $5}')
    echo " $path: $size"
  done
}
156
157 #------------------------------------------------------------------------------
158 # Benchmark Functions
159 #------------------------------------------------------------------------------
160
# Run a command multiple times and print the median wall-clock time.
#
# Arguments: $1 - command line as a single string; executed with eval
# Globals:   RUNS (read) - number of timed repetitions
# Outputs:   median elapsed seconds (for example 0.123) on stdout; a run
#            whose timing cannot be parsed is recorded as 999
run_timed() {
  local cmd="$1"
  local -a samples=()
  local run elapsed
  # Make the `time` keyword report bare elapsed seconds with three decimals,
  # so no minutes/seconds parsing is needed. Declared local so a TIMEFORMAT
  # set by the caller is neither used nor modified.
  local TIMEFORMAT='%3R'

  for ((run = 1; run <= RUNS; run++)); do
    # Timing uses the bash `time` keyword (not /usr/bin/time). The command's
    # own output is discarded, so the only text captured is the time report.
    # `|| true` keeps a non-zero exit of the timed command (e.g. grep finding
    # no match) from aborting the script under `set -e`.
    elapsed=$({ time eval "$cmd" >/dev/null 2>&1; } 2>&1) || true
    # Normalize a decimal comma, in case the locale makes bash print one.
    elapsed=${elapsed/,/.}
    if [[ ! "$elapsed" =~ ^[0-9]+\.[0-9]+$ ]]; then
      elapsed="999" # Error case
    fi
    samples+=("$elapsed")
  done

  # Return median (sort and take middle)
  printf '%s\n' "${samples[@]}" | sort -n | sed -n "$(( (RUNS + 1) / 2 ))p"
}
185
# Run a command multiple times and print the median elapsed wall-clock time,
# measured by reading a clock before and after each run.
#
# Clock source:
#   - EPOCHREALTIME when the running bash provides it. The variable is read
#     in-process, so no helper program's startup time ends up inside the
#     measured interval.
#   - otherwise python3, falling back to `date +%s.%N`, with bc doing the
#     subtraction. On this path the time needed to launch the helper after
#     the command finishes is part of every sample.
#
# Arguments: $1 - command line as a single string; executed with eval
# Globals:   RUNS (read) - number of timed repetitions
# Outputs:   median elapsed seconds on stdout
run_timed_portable() {
  local cmd="$1"
  local -a samples=()
  local run start end elapsed

  for ((run = 1; run <= RUNS; run++)); do
    if [[ -n "${EPOCHREALTIME:-}" ]]; then
      # Dropping the decimal separator (whatever character the locale uses)
      # leaves an integer count of microseconds, so the difference can be
      # computed exactly with shell arithmetic.
      start=${EPOCHREALTIME//[!0-9]/}
      eval "$cmd" >/dev/null 2>&1 || true
      end=${EPOCHREALTIME//[!0-9]/}
      elapsed=$((end - start))
      if ((elapsed < 0)); then
        elapsed=0 # clock stepped backwards during the run
      fi
      printf -v elapsed '%d.%06d' "$((elapsed / 1000000))" "$((elapsed % 1000000))"
    else
      start=$(python3 -c 'import time; print(time.time())' 2>/dev/null || date +%s.%N)
      # `|| true`: a non-zero exit of the timed command (e.g. grep finding
      # no match) must not abort the script under `set -e`.
      eval "$cmd" >/dev/null 2>&1 || true
      end=$(python3 -c 'import time; print(time.time())' 2>/dev/null || date +%s.%N)
      elapsed=$(echo "$end - $start" | bc -l)
    fi
    samples+=("$elapsed")
  done

  # Return median
  printf '%s\n' "${samples[@]}" | sort -n | sed -n "$(( (RUNS + 1) / 2 ))p"
}
202
# Time one search with ferp and with grep, record both medians and print a
# one-line comparison.
#
# Arguments:
#   $1 - label shown in the report
#   $2 - file to search
#   $3 - extra options for ferp (may be empty; split into words by eval)
#   $4 - extra options for grep (may be empty; split into words by eval)
#   $5 - search pattern
# Globals: FERP, GREP (read); RESULT_NAMES, RESULT_FERP_TIMES,
#          RESULT_GREP_TIMES (appended); GREEN, RED, NC (read)
# Outputs: "<label> ferp: <t>s grep: <t>s <speedup>x"; the speedup is
#          grep time / ferp time, green above 1.5 and red below 0.8.
#          When the ferp time is zero the speedup is shown as N/A.
benchmark_pattern() {
  local name="$1"
  local file="$2"
  local ferp_args="$3"
  local grep_args="$4"
  local pattern="$5"
  local ferp_cmd grep_cmd ferp_time grep_time stats speedup verdict color

  printf " %-35s" "$name"

  # The timing helper takes a command string and evals it. %q shell-quotes
  # the program path, pattern and file so each survives eval as exactly one
  # word, even if it contains quotes, spaces or shell metacharacters.
  printf -v ferp_cmd '%q %s %q %q' "$FERP" "$ferp_args" "$pattern" "$file"
  printf -v grep_cmd '%q %s %q %q' "$GREP" "$grep_args" "$pattern" "$file"

  # Run ferp
  ferp_time=$(run_timed_portable "$ferp_cmd")

  # Run grep
  grep_time=$(run_timed_portable "$grep_cmd")

  # Calculate speedup and classify it in one awk call. A ferp time of zero
  # is reported as N/A instead of dividing by it. The thresholds are applied
  # to the two-decimal value that is displayed.
  stats=$(awk -v f="$ferp_time" -v g="$grep_time" 'BEGIN {
    if (f + 0 <= 0) { print "N/A none"; exit }
    s = sprintf("%.2f", g / f)
    if (s + 0 > 1.5) { v = "fast" } else if (s + 0 < 0.8) { v = "slow" } else { v = "none" }
    print s, v
  }')
  speedup=${stats%% *}
  verdict=${stats##* }

  # Store results
  RESULT_NAMES+=("$name")
  RESULT_FERP_TIMES+=("$ferp_time")
  RESULT_GREP_TIMES+=("$grep_time")

  # Color-code the speedup
  case "$verdict" in
    fast) color="$GREEN" ;;
    slow) color="$RED" ;;
    *) color="$NC" ;;
  esac

  # The color variables hold backslash escapes, which is why they are placed
  # in the printf format string rather than passed as arguments.
  if [[ "$speedup" == "N/A" ]]; then
    printf "ferp: %6.3fs grep: %6.3fs ${color}%5s${NC}\n" "$ferp_time" "$grep_time" "$speedup"
  else
    printf "ferp: %6.3fs grep: %6.3fs ${color}%5.2fx${NC}\n" "$ferp_time" "$grep_time" "$speedup"
  fi
}
236
237 #------------------------------------------------------------------------------
238 # Benchmark Suites
239 #------------------------------------------------------------------------------
240
# Literal-string searches against the large English-like text file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_literal_benchmarks() {
  local target="$BENCH_DIR/english_large.txt"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Simple word (hello)" "" "" "hello"
    "Common word (the)" "" "" "the"
    "Longer phrase (quick brown)" "" "" "quick brown"
    "Case insensitive (-i hello)" "-i" "-i" "hello"
    "Fixed string (-F hello)" "-F" "-F" "hello"
    "Word boundary (-w the)" "-w" "-w" "the"
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== Literal String Matching ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
252
# Regular-expression searches against the large English-like text file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_regex_benchmarks() {
  local target="$BENCH_DIR/english_large.txt"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Dot wildcard (h.llo)" "" "" "h.llo"
    "Star quantifier (hel*o)" "" "" "hel*o"
    "Character class ([a-z]+)" "-E" "-E" "[a-z]+"
    "Mixed class ([a-zA-Z0-9]+)" "-E" "-E" "[a-zA-Z0-9]+"
    "Digit class ([0-9]+)" "-E" "-E" "[0-9]+"
    "Alternation (cat|dog|fox)" "-E" "-E" "cat|dog|fox"
    "Optional (colou?r)" "-E" "-E" "colou?r"
    "One or more (hel+o)" "-E" "-E" "hel+o"
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== Regular Expression Matching ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
266
# Anchored patterns (line and word boundaries) against the large text file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_anchor_benchmarks() {
  local target="$BENCH_DIR/english_large.txt"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Start anchor (^The)" "" "" "^The"
    "End anchor (\\.$)" "" "" '\.$'
    "Both anchors (^The.*dog$)" "-E" "-E" "^The.*dog$"
    "Word start (\\<quick)" "" "" '\<quick'
    "Word end (fox\\>)" "" "" 'fox\>'
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== Anchor Patterns ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
277
# Typical log-analysis searches against the generated log file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_log_benchmarks() {
  local target="$BENCH_DIR/logs_medium.txt"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Log level (ERROR)" "" "" "ERROR"
    "Log level (-i warn)" "-i" "-i" "warn"
    "Timestamp pattern ([0-9]{2}:[0-9]{2})" "-E" "-E" "[0-9]{2}:[0-9]{2}"
    "Component (component-[0-9]+)" "-E" "-E" "component-[0-9]+"
    "Multiple levels (ERROR|WARN)" "-E" "-E" "ERROR|WARN"
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== Log File Patterns ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
288
# Source-code style searches against the generated code file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_code_benchmarks() {
  local target="$BENCH_DIR/code_medium.txt"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Function name (function)" "" "" "function"
    "Variable (const|let|var)" "-E" "-E" "const|let|var"
    "Return statement (return)" "" "" "return"
    "Console log (console\\.log)" "-E" "-E" "console\\.log"
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== Code Pattern Matching ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
298
# Field-oriented searches against the generated CSV file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_csv_benchmarks() {
  local target="$BENCH_DIR/data_medium.csv"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Email pattern (@.*\\.com)" "-E" "-E" "@.*\\.com"
    "Specific domain (domain50)" "" "" "domain50"
    "User pattern (user_[0-9]+)" "-E" "-E" "user_[0-9]+"
    "High score (,[89][0-9],)" "-E" "-E" ",[89][0-9],"
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== CSV/Data Pattern Matching ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
308
# Option-driven modes (invert, count, line numbers) and wide alternations
# against the large English-like text file.
# Globals: BENCH_DIR (read); BOLD, BLUE, NC (read)
run_special_benchmarks() {
  local target="$BENCH_DIR/english_large.txt"
  local k
  # One benchmark per row: label, ferp options, grep options, pattern.
  local -a rows=(
    "Invert match (-v error)" "-v" "-v" "error"
    "Count only (-c the)" "-c" "-c" "the"
    "Line number (-n hello)" "-n" "-n" "hello"
    "Multiple patterns (cat|dog|bird|fish)" "-E" "-E" "cat|dog|bird|fish"
    "Long alternation (the|and|for|with|from)" "-E" "-E" "the|and|for|with|from"
  )

  printf '%b\n' "\n${BOLD}${BLUE}=== Special Cases ===${NC}"

  for ((k = 0; k < ${#rows[@]}; k += 4)); do
    benchmark_pattern "${rows[k]}" "$target" "${rows[k + 1]}" "${rows[k + 2]}" "${rows[k + 3]}"
  done
}
319
# Print numerator / denominator with one decimal place, or N/A when the
# denominator is zero or not a positive number.
# Arguments: $1 - numerator, $2 - denominator
_scaling_ratio() {
  awk -v num="$1" -v den="$2" 'BEGIN {
    if (den + 0 <= 0) { print "N/A"; exit }
    printf "%.1f\n", num / den
  }'
}

# Run the same pattern on the small and the large text file and report how
# much each tool's time grows between the two.
#
# Globals: BENCH_DIR (read); RESULT_FERP_TIMES, RESULT_GREP_TIMES (read,
#          after benchmark_pattern has appended to them); BOLD, BLUE, CYAN,
#          NC (read)
# Outputs: the two benchmark lines followed by the large/small time ratios
run_scaling_benchmarks() {
  local recorded small_ferp large_ferp small_grep large_grep
  local ferp_scale="N/A"
  local grep_scale="N/A"

  echo -e "\n${BOLD}${BLUE}=== Scaling Tests ===${NC}"

  echo -e " ${CYAN}Small file (~700KB):${NC}"
  benchmark_pattern " [a-z]+ on small" "$BENCH_DIR/english_small.txt" "-E" "-E" "[a-z]+"

  echo -e " ${CYAN}Large file (~70MB):${NC}"
  benchmark_pattern " [a-z]+ on large" "$BENCH_DIR/english_large.txt" "-E" "-E" "[a-z]+"

  # Calculate scaling factor (get last two results). The two calls above are
  # expected to have appended one entry each; with fewer than two entries the
  # subscripts below would be negative, so the ratios stay at N/A instead.
  recorded=${#RESULT_FERP_TIMES[@]}
  if ((recorded >= 2)); then
    small_ferp="${RESULT_FERP_TIMES[recorded - 2]}"
    large_ferp="${RESULT_FERP_TIMES[recorded - 1]}"
    small_grep="${RESULT_GREP_TIMES[recorded - 2]}"
    large_grep="${RESULT_GREP_TIMES[recorded - 1]}"
    ferp_scale=$(_scaling_ratio "$large_ferp" "$small_ferp")
    grep_scale=$(_scaling_ratio "$large_grep" "$small_grep")
  fi

  echo -e "\n ${CYAN}Scaling (large/small ratio):${NC}"
  echo -e " ferp: ${ferp_scale}x grep: ${grep_scale}x (lower is better for large files)"
}
341
342 #------------------------------------------------------------------------------
343 # Report Generation
344 #------------------------------------------------------------------------------
345
# Print the closing report: win counts, total times, overall speedup and a
# short description of the machine and tools.
#
# Globals: RESULT_NAMES, RESULT_FERP_TIMES, RESULT_GREP_TIMES (read);
#          FERP, GREP, RUNS (read); color variables (read)
# Outputs: the summary on stdout
# Notes:   - a benchmark counts as a ferp win only when ferp was strictly
#            faster; ties are counted for grep
#          - "Average speedup" is total grep time divided by total ferp
#            time, shown as N/A when the ferp total is zero (for example
#            when no benchmark was recorded)
print_summary() {
  local count=${#RESULT_NAMES[@]}
  local idx totals total_ferp total_grep wins_ferp wins_grep avg_speedup
  local cpu_model ferp_version grep_version

  echo -e "\n${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"
  echo -e "${BOLD}${BLUE} SUMMARY ${NC}"
  echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"

  # Feed every (ferp, grep) pair to a single awk process that accumulates
  # totals and win counts; its END block runs even when there is no input.
  totals=$(
    for idx in "${!RESULT_NAMES[@]}"; do
      printf '%s %s\n' "${RESULT_FERP_TIMES[$idx]}" "${RESULT_GREP_TIMES[$idx]}"
    done | awk '
      { tf += $1; tg += $2; if ($1 + 0 < $2 + 0) wf++; else wg++ }
      END {
        sp = (tf > 0) ? sprintf("%.2f", tg / tf) : "N/A"
        printf "%.6f %.6f %d %d %s\n", tf, tg, wf, wg, sp
      }'
  )
  read -r total_ferp total_grep wins_ferp wins_grep avg_speedup <<< "$totals"

  echo -e "\n${CYAN}Overall Statistics:${NC}"
  echo -e " Total benchmarks run: $count"
  echo -e " ferp wins: ${GREEN}$wins_ferp${NC}"
  echo -e " grep wins: ${RED}$wins_grep${NC}"
  printf " Total time - ferp: %.3fs grep: %.3fs\n" "$total_ferp" "$total_grep"
  echo -e " ${BOLD}Average speedup: ${GREEN}${avg_speedup}x${NC}"

  # Each probe is allowed to fail (`|| true`); an empty result is shown as
  # "Unknown". sysctl is tried first, then lscpu.
  cpu_model=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || lscpu 2>/dev/null | grep 'Model name' | cut -d: -f2 | xargs) || true
  ferp_version=$("$FERP" --version 2>&1 | head -1) || true
  grep_version=$("$GREP" --version 2>&1 | head -1) || true

  echo -e "\n${CYAN}System Information:${NC}"
  echo -e " OS: $(uname -s) $(uname -r)"
  echo -e " CPU: ${cpu_model:-Unknown}"
  echo -e " ferp version: ${ferp_version:-Unknown}"
  echo -e " grep version: ${grep_version:-Unknown}"
  echo -e " Runs per benchmark: $RUNS (median taken)"

  echo -e "\n${BOLD}${BLUE}══════════════════════════════════════════════════════════════${NC}"
}
388
389 #------------------------------------------------------------------------------
390 # Main
391 #------------------------------------------------------------------------------
392
# Entry point: print the banner, prepare the environment, run every
# benchmark suite in order and finish with the summary.
# Globals: RUNS (read); BOLD, BLUE, CYAN, GREEN, NC (read)
main() {
  local suite

  printf '%b\n' "${BOLD}${BLUE}"
  printf '%s\n' \
    "╔══════════════════════════════════════════════════════════════╗" \
    "║ FERP vs grep Benchmark Suite ║" \
    "║ Comprehensive Performance Comparison ║" \
    "╚══════════════════════════════════════════════════════════════╝"
  printf '%b\n' "${NC}"

  check_prerequisites
  create_test_files

  printf '%b\n' "\n${BOLD}${CYAN}Running benchmarks (${RUNS} runs each, reporting median)...${NC}"
  printf '%b\n' "${CYAN}Format: ferp time | grep time | speedup (>1 = ferp faster)${NC}\n"

  # Suites run in this fixed order; each name maps to run_<name>_benchmarks.
  for suite in literal regex anchor log code csv special scaling; do
    "run_${suite}_benchmarks"
  done

  print_summary

  printf '%b\n' "\n${GREEN}Benchmark complete!${NC}"
}
420
# Apply command-line options to the benchmark settings.
#
# Every argument is examined, so options can be combined (for example
# `--quick --runs 5`); when two options set the same value the later one wins.
#
# Arguments: the script's command-line arguments
# Globals:   RUNS, SMALL_LINES, MEDIUM_LINES, LARGE_LINES (written);
#            YELLOW, NC (read)
# Returns:   0 on success; exits 0 after printing help; exits 2 on an
#            unknown option or an invalid run count
parse_args() {
  while (($# > 0)); do
    case "$1" in
      -h | --help)
        echo "Usage: $0 [OPTIONS]"
        echo ""
        echo "Options:"
        echo " -r, --runs N Number of runs per benchmark (default: 3)"
        echo " -q, --quick Quick mode (smaller files, fewer runs)"
        echo " -h, --help Show this help"
        exit 0
        ;;
      -q | --quick)
        RUNS=1
        SMALL_LINES=1000
        MEDIUM_LINES=10000
        LARGE_LINES=100000
        echo -e "${YELLOW}Quick mode: reduced file sizes and single run${NC}"
        ;;
      -r | --runs)
        # A missing or empty value falls back to 3.
        RUNS="${2:-3}"
        # The run count drives loop bounds and the median index, so anything
        # other than a positive integer is rejected up front.
        if [[ ! "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
          echo "ERROR: $1 expects a positive integer, got '$RUNS'" >&2
          exit 2
        fi
        if (($# >= 2)); then
          shift # consume the value as well as the option
        fi
        ;;
      *)
        echo "ERROR: unknown option '$1' (try --help)" >&2
        exit 2
        ;;
    esac
    shift
  done
}

# Run with optional arguments
parse_args "$@"

main