C · 3781 bytes Raw Blame History
1 /*
2 * SIMD character scanning for FERP
3 * Uses ARM NEON on Apple Silicon, scalar fallback otherwise
4 */
5
6 #include <stdint.h>
7 #include <stddef.h>
8
9 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
10 #include <arm_neon.h>
11 #define USE_NEON 1
12 #else
13 #define USE_NEON 0
14 #endif
15
/*
 * Find the first occurrence of byte 'needle' in buf[start .. len-1].
 *
 *   buf    - buffer to scan; only indices 0 .. len-1 are read
 *   len    - number of valid bytes in buf
 *   start  - index at which scanning begins; negative values are treated as 0
 *   needle - byte to look for
 *
 * Returns the 0-indexed position of the first match at or after 'start',
 * or -1 if there is none (also when buf is NULL or the range is empty).
 * When USE_NEON is enabled the bulk of the range is scanned in 16-byte
 * chunks; otherwise the scalar loop at the end does all the work.
 */
int64_t simd_find_char(const char *buf, int64_t len, int64_t start, char needle) {
    if (buf == NULL) return -1;
    /* A negative start would make buf + start point before the buffer. */
    if (start < 0) start = 0;
    if (start >= len) return -1;

    const char *p = buf + start;
    int64_t pos = start;

#if USE_NEON
    /* Needle replicated into all 16 lanes for the vector compare */
    uint8x16_t vneedle = vdupq_n_u8((uint8_t)needle);

    /* Scalar steps until p sits on a 16-byte boundary */
    while (pos < len && ((uintptr_t)p & 15)) {
        if (*p == needle) return pos;
        p++;
        pos++;
    }

    /*
     * SIMD scan, 16 bytes at a time. The bound is written as a subtraction
     * so it cannot overflow for very large len.
     */
    while (len - pos >= 16) {
        uint8x16_t chunk = vld1q_u8((const uint8_t *)p);
        uint8x16_t cmp = vceqq_u8(chunk, vneedle);

        /* Matching lanes are 0xFF, so a non-zero maximum means a hit */
        if (vmaxvq_u8(cmp)) {
            /* Locate the first matching byte inside this chunk */
            for (int i = 0; i < 16; i++) {
                if (p[i] == needle) return pos + i;
            }
        }
        p += 16;
        pos += 16;
    }
#endif

    /* Scalar scan of whatever is left (the whole range without NEON) */
    while (pos < len) {
        if (*p == needle) return pos;
        p++;
        pos++;
    }

    return -1;
}
62
/*
 * Find the first occurrence of the 2-byte sequence (c1, c2) in
 * buf[start .. len-1].
 *
 *   buf    - buffer to scan; only indices 0 .. len-1 are read
 *   len    - number of valid bytes in buf
 *   start  - index at which scanning begins; negative values are treated as 0
 *   c1, c2 - first and second byte of the sequence
 *
 * Returns the 0-indexed position of c1 for the first match at or after
 * 'start', or -1 if there is none (also when buf is NULL or fewer than two
 * bytes are available from 'start').
 */
int64_t simd_find_char2(const char *buf, int64_t len, int64_t start, char c1, char c2) {
    /* A pair needs at least two bytes; checking first keeps len - 1 safe. */
    if (buf == NULL || len < 2) return -1;
    /* A negative start would make buf + start point before the buffer. */
    if (start < 0) start = 0;

    /* A match may begin at any index strictly below 'last'. */
    const int64_t last = len - 1;
    if (start >= last) return -1;

    int64_t pos = start;

#if USE_NEON
    /* First byte replicated into all 16 lanes; c2 is checked in scalar code */
    uint8x16_t vc1 = vdupq_n_u8((uint8_t)c1);
    const char *p = buf + start;

    /* Scalar steps until p sits on a 16-byte boundary */
    while (pos < last && ((uintptr_t)p & 15)) {
        if (p[0] == c1 && p[1] == c2) return pos;
        p++;
        pos++;
    }

    /*
     * SIMD scan for the first byte. The loop bound guarantees
     * pos + 16 <= len - 1, so p[i + 1] is in range for every i in 0..15.
     */
    while (last - pos >= 16) {
        uint8x16_t chunk = vld1q_u8((const uint8_t *)p);
        uint8x16_t cmp = vceqq_u8(chunk, vc1);

        if (vmaxvq_u8(cmp)) {
            /* c1 occurs somewhere in this chunk; verify each candidate */
            for (int i = 0; i < 16; i++) {
                if (p[i] == c1 && p[i + 1] == c2) return pos + i;
            }
        }
        p += 16;
        pos += 16;
    }
#endif

    /* Scalar scan of whatever is left (the whole range without NEON) */
    const char *p2 = buf + pos;
    while (pos < last) {
        if (p2[0] == c1 && p2[1] == c2) return pos;
        p2++;
        pos++;
    }

    return -1;
}
107
/*
 * Count occurrences of byte 'needle' in buf[0 .. len-1]
 * (useful for line counting).
 *
 *   buf    - buffer to scan
 *   len    - number of bytes in buf
 *   needle - byte to count
 *
 * Returns the number of matching bytes; 0 when buf is NULL or len <= 0.
 */
int64_t simd_count_char(const char *buf, int64_t len, char needle) {
    if (buf == NULL || len <= 0) return 0;

    int64_t count = 0;
    const char *p = buf;
    const char *end = buf + len;

#if USE_NEON
    uint8x16_t vneedle = vdupq_n_u8((uint8_t)needle);
    uint8x16_t vcount = vdupq_n_u8(0);   /* 16 independent 8-bit counters */
    int batch = 0;                       /* chunks folded into vcount so far */

    /* Scalar steps until p sits on a 16-byte boundary */
    while (p < end && ((uintptr_t)p & 15)) {
        if (*p++ == needle) count++;
    }

    /* SIMD count, 16 bytes at a time */
    while (end - p >= 16) {
        uint8x16_t chunk = vld1q_u8((const uint8_t *)p);
        uint8x16_t cmp = vceqq_u8(chunk, vneedle);
        /* Matching lanes are 0xFF (-1); subtracting adds 1 to that lane */
        vcount = vsubq_u8(vcount, cmp);
        p += 16;
        batch++;

        /* Flush before any 8-bit lane counter can wrap past 255 */
        if (batch == 255) {
            /*
             * The lanes can sum to 16 * 255 = 4080, so use the widening
             * horizontal add; vaddvq_u8 returns uint8_t and would wrap
             * the total modulo 256.
             */
            count += vaddlvq_u8(vcount);
            vcount = vdupq_n_u8(0);
            batch = 0;
        }
    }
    /* Fold in the partial batch, again with the widening add */
    count += vaddlvq_u8(vcount);
#endif

    /* Scalar count of whatever is left (the whole buffer without NEON) */
    while (p < end) {
        if (*p++ == needle) count++;
    }

    return count;
}