/*
 * SIMD character scanning for FERP
 * Uses ARM NEON intrinsics when the compiler targets a NEON-capable ARM CPU
 * (e.g. Apple Silicon); falls back to plain scalar loops otherwise.
 */
| 5 | |
| 6 | #include <stdint.h> |
| 7 | #include <stddef.h> |
| 8 | |
/*
 * USE_NEON selects the vectorised code paths in this file.
 *
 * It is enabled only for AArch64 targets: the across-vector reduction
 * intrinsics this file relies on (e.g. vmaxvq_u8) are A64-only, so a 32-bit
 * ARM build must take the scalar path even when __ARM_NEON is defined.
 */
#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON 1
#else
#define USE_NEON 0
#endif
| 15 | |
/*
 * Find the first occurrence of 'needle' in buf[start .. len).
 *
 *   buf     buffer to scan
 *   len     number of valid bytes in buf
 *   start   index at which scanning begins; negative values are treated as 0
 *   needle  byte to search for
 *
 * Returns the 0-indexed position of the first match, or -1 if there is no
 * match (this includes buf == NULL and start >= len).
 * When USE_NEON is enabled the bulk of the buffer is scanned 16 bytes at a
 * time; otherwise the whole range is scanned byte by byte.
 */
int64_t simd_find_char(const char *buf, int64_t len, int64_t start, char needle) {
    if (buf == NULL) return -1;

    /* A negative start would make buf + start point before the buffer. */
    if (start < 0) start = 0;
    if (start >= len) return -1;

    const char *p = buf + start;
    int64_t pos = start;

#if USE_NEON
    const uint8x16_t vneedle = vdupq_n_u8((uint8_t)needle);

    /* Scalar prologue: advance until p sits on a 16-byte boundary. */
    while (pos < len && ((uintptr_t)p & 15)) {
        if (*p == needle) return pos;
        p++;
        pos++;
    }

    /*
     * Vector loop. The bound is written as len - pos so that pos + 16 is
     * never formed (it could overflow for a len close to INT64_MAX).
     */
    while (len - pos >= 16) {
        uint8x16_t chunk = vld1q_u8((const uint8_t *)p);
        uint8x16_t cmp = vceqq_u8(chunk, vneedle);

        /* Matching lanes are 0xFF, so a non-zero maximum means "hit". */
        if (vmaxvq_u8(cmp)) {
            /* Locate the first matching byte inside this chunk. */
            for (int i = 0; i < 16; i++) {
                if (p[i] == needle) return pos + i;
            }
        }
        p += 16;
        pos += 16;
    }
#endif

    /* Scalar tail (or the entire range when NEON is unavailable). */
    while (pos < len) {
        if (*p == needle) return pos;
        p++;
        pos++;
    }

    return -1;
}
| 62 | |
/*
 * Find the first occurrence of the two-byte sequence (c1, c2) in
 * buf[start .. len).
 *
 *   buf     buffer to scan
 *   len     number of valid bytes in buf
 *   start   index at which scanning begins; negative values are treated as 0
 *   c1, c2  first and second byte of the sequence
 *
 * Returns the 0-indexed position of c1 for the first match, or -1 if there
 * is no match (this includes buf == NULL, len < 2 and start >= len - 1).
 */
int64_t simd_find_char2(const char *buf, int64_t len, int64_t start, char c1, char c2) {
    /* A pair needs two bytes; checking len first keeps len - 1 well defined. */
    if (buf == NULL || len < 2) return -1;

    /* A negative start would make buf + start point before the buffer. */
    if (start < 0) start = 0;

    /* A pair may begin at any index in [0, limit); its second byte is at +1. */
    const int64_t limit = len - 1;
    if (start >= limit) return -1;

    const char *p = buf + start;
    int64_t pos = start;

#if USE_NEON
    const uint8x16_t vc1 = vdupq_n_u8((uint8_t)c1);

    /* Scalar prologue: advance until p sits on a 16-byte boundary. */
    while (pos < limit && ((uintptr_t)p & 15)) {
        if (p[0] == c1 && p[1] == c2) return pos;
        p++;
        pos++;
    }

    /*
     * Vector loop: look for c1 in 16 bytes at once. limit - pos >= 16 means
     * pos + 16 <= len - 1, so p[i + 1] is in bounds for every i in 0..15 and
     * no per-byte bounds check is needed.
     */
    while (limit - pos >= 16) {
        uint8x16_t chunk = vld1q_u8((const uint8_t *)p);
        uint8x16_t cmp = vceqq_u8(chunk, vc1);

        if (vmaxvq_u8(cmp)) {
            /* c1 is somewhere in this chunk; confirm the following byte. */
            for (int i = 0; i < 16; i++) {
                if (p[i] == c1 && p[i + 1] == c2) return pos + i;
            }
        }
        p += 16;
        pos += 16;
    }
#endif

    /* Scalar tail (or the entire range when NEON is unavailable). */
    while (pos < limit) {
        if (p[0] == c1 && p[1] == c2) return pos;
        p++;
        pos++;
    }

    return -1;
}
| 107 | |
/*
 * Count occurrences of 'needle' in buf[0 .. len) (useful for line counting).
 *
 *   buf     buffer to scan
 *   len     number of valid bytes in buf
 *   needle  byte to count
 *
 * Returns the number of bytes equal to 'needle'; 0 when buf is NULL or
 * len <= 0.
 */
int64_t simd_count_char(const char *buf, int64_t len, char needle) {
    if (buf == NULL || len <= 0) return 0;

    int64_t count = 0;
    const char *p = buf;
    const char *end = buf + len;

#if USE_NEON
    const uint8x16_t vneedle = vdupq_n_u8((uint8_t)needle);
    uint8x16_t vcount = vdupq_n_u8(0);   /* 16 independent 8-bit lane counters */
    int batch = 0;                       /* chunks accumulated since last flush */

    /* Scalar prologue: advance until p sits on a 16-byte boundary. */
    while (p < end && ((uintptr_t)p & 15)) {
        if (*p++ == needle) count++;
    }

    /* end - p avoids forming p + 16, which may point past the buffer. */
    while (end - p >= 16) {
        uint8x16_t chunk = vld1q_u8((const uint8_t *)p);
        uint8x16_t cmp = vceqq_u8(chunk, vneedle);
        /* Matching lanes are 0xFF (-1); subtracting adds 1 to that lane. */
        vcount = vsubq_u8(vcount, cmp);
        p += 16;

        /*
         * Each lane gains at most 1 per chunk, so flush after 255 chunks,
         * before any 8-bit lane can wrap.
         */
        if (++batch == 255) {
            /*
             * Widening reduction: the 16 lanes can total up to 16 * 255,
             * which does not fit in the uint8_t that vaddvq_u8 returns.
             */
            count += vaddlvq_u8(vcount);
            vcount = vdupq_n_u8(0);
            batch = 0;
        }
    }
    /* Flush whatever is left in the lane counters (same widening sum). */
    count += vaddlvq_u8(vcount);
#endif

    /* Scalar tail (or the entire buffer when NEON is unavailable). */
    while (p < end) {
        if (*p++ == needle) count++;
    }

    return count;
}