@@ -0,0 +1,404 @@ |
| | 1 | +#!/usr/bin/env python3 |
| | 2 | +# |
| | 3 | +# Hi; I know this is inefficient; |
| | 4 | +# Hi; I know it even hangs on run; |
| | 5 | +# Hi; You know this is insufficient; |
| | 6 | +# Hey, but you know it runs! |
| | 7 | +# |
| | 8 | +# |
| | 9 | +import argparse |
| | 10 | +import cv2 |
| | 11 | +import numpy as np |
| | 12 | +import pyaudio |
| | 13 | +import wave |
| | 14 | +import os |
| | 15 | +from collections import deque |
| | 16 | +import time |
| | 17 | + |
| | 18 | +# for convolution and threading |
| | 19 | +# |
| | 20 | +from scipy.signal import fftconvolve |
| | 21 | +from concurrent.futures import ThreadPoolExecutor |
| | 22 | + |
| | 23 | +import soundfile as sf |
| | 24 | + |
| | 25 | +from gesture_dsp.dsp_effects import ( |
| | 26 | + mid_side, |
| | 27 | + convolution_reverb, |
| | 28 | + bitcrush, |
| | 29 | + filters, |
| | 30 | + spectral_freeze, |
| | 31 | + delay, |
| | 32 | + pitch_shift, |
| | 33 | +) |
| | 34 | + |
| | 35 | + |
def parse_args(argv=None):
    """Parse command-line arguments for the gesture-DSP demo.

    Args:
        argv: Optional list of argument strings; defaults to
            ``sys.argv[1:]``. Exposed so the parser can be exercised in
            tests without touching the real command line.

    Returns:
        argparse.Namespace with ``wav`` (path to the input stereo WAV)
        and ``ir`` (optional impulse-response WAV path, or None).
    """
    parser = argparse.ArgumentParser(
        description="Gesture-based DSP demo: apply DSP effects via camera gestures."
    )
    parser.add_argument(
        "wav",
        help="Path to input stereo WAV file"
    )
    parser.add_argument(
        "--ir",
        help="Path to impulse response WAV file (for convolution reverb)",
        default=None
    )
    return parser.parse_args(argv)
| | 50 | + |
| | 51 | + |
| | 52 | +# ::: |
| | 53 | +# :::: WRAPPERS FOR THE FO(UR)TRAN MODULES :::: |
| | 54 | +# ::::: ::::::::::::::::::::::::::::::::::: ::::: |
| | 55 | +# |
| | 56 | +# NOTE: most of the f2-y wrappers want outbuffa as return |
| | 57 | +# |
| | 58 | + |
def apply_mid_side(audio_in: np.ndarray, side_gain: float) -> np.ndarray:
    """Apply mid/side processing through the Fortran wrapper.

    Args:
        audio_in: Interleaved stereo sample buffer (L, R, L, R, ...).
        side_gain: Gain applied to the side (difference) channel.

    Returns:
        The processed buffer returned by the wrapper.
    """
    # Number of stereo frames — the wrapper takes frames, not samples.
    n = audio_in.size // 2
    # BUGFIX: the previous version pre-allocated `out` with
    # np.empty_like and then immediately rebound it to the wrapper's
    # return value; that dead allocation has been removed.
    return mid_side.mid_side(audio_in, side_gain, n)
| | 64 | + |
| | 65 | + |
def apply_bitcrush(audio_in: np.ndarray, bit_depth: int = 8) -> np.ndarray:
    """Quantize the buffer down to `bit_depth` bits via the Fortran wrapper."""
    return bitcrush.bitcrush_mod.bitcrush(audio_in, bit_depth)
| | 72 | + |
| | 73 | + |
def apply_lp_filter(audio_in: np.ndarray, cutoff: float, fs: float) -> np.ndarray:
    """Low-pass filter the buffer at `cutoff` Hz (sample rate `fs`)."""
    filtered = filters.filters_mod.lp_filter(audio_in, cutoff, fs)
    return filtered
| | 77 | + |
| | 78 | + |
def apply_hp_filter(audio_in: np.ndarray, cutoff: float, fs: float) -> np.ndarray:
    """High-pass filter the buffer at `cutoff` Hz (sample rate `fs`)."""
    filtered = filters.filters_mod.hp_filter(audio_in, cutoff, fs)
    return filtered
| | 82 | + |
| | 83 | + |
def apply_delay(audio_in: np.ndarray, delay_samps: int = 4410, feedback: float = 0.5) -> np.ndarray:
    """Feedback delay: `delay_samps` samples of delay, `feedback` gain per repeat."""
    return delay.delay_mod.delay(audio_in, delay_samps, feedback)
| | 87 | + |
| | 88 | + |
def apply_convolution_reverb(audio_in: np.ndarray, ir: np.ndarray) -> np.ndarray:
    """Convolve the buffer with the impulse response `ir`.

    Uses FFT-based convolution; mode='same' keeps the output the same
    length as the input so downstream buffer sizes are unchanged.
    """
    wet = fftconvolve(audio_in, ir, mode='same')
    return wet
| | 91 | + |
| | 92 | + |
def apply_spectral_freeze(audio_in: np.ndarray) -> np.ndarray:
    """Spectral-freeze the buffer via the Fortran wrapper.

    Args:
        audio_in: Mono/interleaved sample buffer.

    Returns:
        The frozen buffer returned by the wrapper.
    """
    n = audio_in.size
    # BUGFIX: removed the dead `out = np.empty(...)` allocation that was
    # immediately overwritten by the wrapper's return value.
    return spectral_freeze.spectral_freeze_mod.spectral_freeze(audio_in, n)
| | 98 | + |
| | 99 | + |
def apply_pitch_shift(audio_in: np.ndarray, semitones: int = 4) -> np.ndarray:
    """Pitch-shift the buffer by `semitones` via the Fortran wrapper.

    Args:
        audio_in: Sample buffer.
        semitones: Shift in semitones (negative shifts down).

    Returns:
        The shifted buffer returned by the wrapper.
    """
    # BUGFIX: the old version computed `n` and pre-allocated `out`, but
    # neither was used by the wrapper call — both removed as dead code.
    return pitch_shift.pitch_shift_mod.pitch_shift(audio_in, semitones)
| | 105 | + |
| | 106 | + |
# Effect registry: (name, callable) pairs. Each callable has the uniform
# signature (buf, fs, ir, p) where p in [0, 1] is the gesture-derived
# parameter; each lambda maps p onto that effect's natural control range
# (gain, bit depth, cutoff Hz, delay time/feedback, semitones, ...).
# `ir` and `fs` are ignored by effects that don't need them.
EFFECTS = [
    ("mid_side", lambda buf, fs, ir, p: apply_mid_side(buf, side_gain=min(max(p*2.0, 0.0), 2.0))),
    ("bitcrush", lambda buf, fs, ir, p: apply_bitcrush(buf, bit_depth=int((p**3)*32)+1)),
    ("lowpass", lambda buf, fs, ir, p: apply_lp_filter(buf, cutoff=p*6000+400, fs=fs)),
    ("highpass", lambda buf, fs, ir, p: apply_hp_filter(buf, cutoff=p*6000+200, fs=fs)),
    ("delay", lambda buf, fs, ir, p: apply_delay(buf, delay_samps=int(fs * (0.2 + 0.6 * p)), feedback=0.3 + (0.6 * p))),
    ("reverb", lambda buf, fs, ir, p: apply_convolution_reverb(buf, ir)),
    ("spectral_freeze", lambda buf, fs, ir, p: apply_spectral_freeze(buf)),
    ("pitch_shift", lambda buf, fs, ir, p: apply_pitch_shift(buf, semitones=int(p*60-30))),
]
# Ordered effect names; index order defines the left-to-right camera
# zones used for effect selection and the on-screen label.
EFFECT_NAMES = [name for name, _ in EFFECTS]
| | 119 | + |
| | 120 | + |
class GestureDSP:
    """Streams a stereo WAV through PyAudio, applying the currently
    selected gesture-controlled effect to each callback buffer.

    The effect index (`current_idx`) and its [0, 1] parameter
    (`current_param`) are mutated from the camera loop and read by the
    audio callback.
    """

    def __init__(self, wav_path: str, ir_path: str = None):
        """Open the WAV, start a PyAudio output stream, preallocate the
        per-callback scratch buffers, and optionally load an impulse
        response for the convolution reverb.

        Args:
            wav_path: Path to a stereo 16-bit PCM WAV file.
            ir_path: Optional path to an impulse-response WAV; when
                absent or missing on disk, a unit-length zero IR is used
                (reverb then outputs silence).
        """
        self.wf = wave.open(wav_path, 'rb')
        assert self.wf.getnchannels() == 2, "Need stereo WAV"
        self.fs = self.wf.getframerate()
        self.p = pyaudio.PyAudio()

        self.frames_per_buffer = 4096
        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=2,
            rate=self.fs,
            output=True,
            frames_per_buffer=self.frames_per_buffer,
            stream_callback=self._callback
        )

        # Preallocate scratch buffers once so the real-time audio
        # callback performs no large allocations.
        num_channels = self.wf.getnchannels()
        max_samples = self.frames_per_buffer * num_channels
        self._in_buffer = np.empty(max_samples, dtype=np.float64)
        self._f64_clip_buffer = np.empty(max_samples, dtype=np.float64)
        self._int16_out_buffer = np.empty(max_samples, dtype=np.int16)

        # Load the IR if provided.
        if ir_path and os.path.exists(ir_path):
            try:
                with wave.open(ir_path, 'rb') as ir_wf:
                    data = ir_wf.readframes(ir_wf.getnframes())
                self.ir = np.frombuffer(data, dtype=np.int16).astype(np.float64)
            except wave.Error:
                # fallback for non-PCM WAV formats
                data, sr = sf.read(ir_path, dtype='float64')

                # flatten multi-channel IR
                if data.ndim > 1:
                    data = data.flatten()

                # Scale float [-1, 1] into the int16-magnitude range used
                # by the PCM path above so both branches are comparable.
                self.ir = (data * 32767).astype(np.float64)
        else:
            self.ir = np.zeros(1, dtype=np.float64)

        # Start on the first effect with a mid-range parameter.
        self.current_idx = 0
        self.current_param = 0.5

    def _callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: read frames, apply the current
        effect, clip to int16 range, and emit the processed bytes."""
        raw = self.wf.readframes(frame_count)
        if not raw:
            return (raw, pyaudio.paComplete)

        # Convert int16 input into the pre-allocated float buffer.
        in_int16 = np.frombuffer(raw, dtype=np.int16)
        n = in_int16.size
        self._in_buffer[:n] = in_int16.astype(np.float64)

        # Apply DSP. BUGFIX: process only the samples actually read this
        # callback — the final buffer of the file is usually short, and
        # the old code handed the effect the full scratch buffer with a
        # stale tail from the previous callback.
        _, func = EFFECTS[self.current_idx]
        audio_out = func(self._in_buffer[:n], self.fs, self.ir, self.current_param)

        # Clip and convert to int16 into the pre-allocated buffer.
        # BUGFIX: clip into a size-matched view and emit only the valid
        # prefix; clipping into the full buffer raised a shape-mismatch
        # error on short reads, and tobytes() on the whole buffer would
        # append leftover garbage audio.
        m = audio_out.size
        np.clip(audio_out, -32768, 32767, out=self._f64_clip_buffer[:m])
        self._int16_out_buffer[:m] = self._f64_clip_buffer[:m].astype(np.int16)

        return (self._int16_out_buffer[:m].tobytes(), pyaudio.paContinue)

    def start(self):
        """Begin audio playback."""
        self.stream.start_stream()

    def stop(self):
        """Stop playback and release stream, WAV file, and PyAudio."""
        self.stream.stop_stream()
        self.stream.close()
        self.wf.close()
        self.p.terminate()

    def set_effect_by_zone(self, x_center: float, frame_w: int):
        """Divide frame into N vertical zones to pick effect."""
        zone_width = frame_w / len(EFFECT_NAMES)
        idx = int(x_center // zone_width)
        # Clamp so off-by-one at the right frame edge can't overrun.
        self.current_idx = max(0, min(idx, len(EFFECT_NAMES) - 1))
| | 202 | + |
| | 203 | + |
| | 204 | +# ::: |
| | 205 | +# :::: HAND-DETECTION HELPERS :::: |
| | 206 | +# ::::: :::::::::::::::::::::: ::::: |
| | 207 | +# |
| | 208 | + |
| | 209 | +# god this is ugly; |
| | 210 | +# god; forgive me; |
| | 211 | +# god; and I don't |
| | 212 | +# even believe in you; |
| | 213 | +# ; |
def is_hand_raised(frame):
    """Detect the pink/red glove via HSV thresholding.

    Returns the bounding box (x, y, w, h) when the glove's centre sits
    in the upper ~70% of the frame, otherwise None.
    """
    smoothed = cv2.GaussianBlur(frame, (5, 5), 0)
    hsv = cv2.cvtColor(smoothed, cv2.COLOR_BGR2HSV)

    # Red/pink wraps around the HSV hue axis, so two ranges are OR-ed.
    mask_hi = cv2.inRange(hsv, np.array([140, 50, 50]), np.array([179, 255, 255]))
    mask_lo = cv2.inRange(hsv, np.array([0, 50, 50]), np.array([10, 255, 255]))
    combined = cv2.bitwise_or(mask_hi, mask_lo)

    # Morphological opening removes speckle noise from the mask.
    cleaned = cv2.morphologyEx(combined, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))

    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None

    biggest = max(contours, key=cv2.contourArea)
    if cv2.contourArea(biggest) < 8:
        return None

    x, y, w, h = cv2.boundingRect(biggest)
    # Count as "raised" only when the box centre is above 70% of height.
    if (y + h / 2) < frame.shape[0] * 0.70:
        return (x, y, w, h)
    return None
| | 239 | + |
def is_second_glove_raised(frame):
    """Detect the green glove anywhere in the frame.

    Returns its bounding box (x, y, w, h) or None when no plausible
    green blob is found.
    """
    hsv = cv2.cvtColor(cv2.GaussianBlur(frame, (5, 5), 0), cv2.COLOR_BGR2HSV)

    # Single green hue band; opening removes speckle noise.
    green_mask = cv2.inRange(hsv, np.array([35, 50, 50]), np.array([85, 255, 255]))
    green_mask = cv2.morphologyEx(green_mask, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))

    contours, _ = cv2.findContours(green_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None

    # Keep only the largest blob so stray green pixels don't win.
    candidate = max(contours, key=cv2.contourArea)
    if cv2.contourArea(candidate) < 100:
        return None

    return cv2.boundingRect(candidate)
| | 259 | + |
| | 260 | + |
| | 261 | +# ::: |
| | 262 | +# :::: MAIN LOOP :::: |
| | 263 | +# ::::: ::::::::: ::::: |
| | 264 | +# |
| | 265 | + |
def main():
    """Run the camera loop: detect the two gloves, map the first glove's
    horizontal position to effect selection and the second glove's
    vertical position to the effect parameter, and overlay status on the
    video feed. Quits on 'q'/Esc or when playback finishes."""
    args = parse_args()

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("🔴 Cannot open camera")
        return

    # Offload the colour-based detection to a single worker thread; the
    # main loop consumes the *previous* frame's result so the UI never
    # blocks on detection.
    executor = ThreadPoolExecutor(max_workers=1)
    prev_future = None

    # Vertical band of the frame that maps onto the [0, 1] parameter.
    param_min_ratio = 0.2
    param_max_ratio = 0.8

    # Exponential-smoothing factor (higher = smoother/slower response).
    param_smooth = 0.2
    smoothed_param = 0.0

    player = GestureDSP(args.wav, ir_path=args.ir)
    player.start()

    # Recent effect indices, used for the "same effect held" cue.
    history = deque(maxlen=10)
    cue_mode = False
    last_cue_time = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.flip(frame, 1)

            # Schedule detection for this frame off the main thread and
            # consume the previous frame's result if it is ready.
            future = executor.submit(
                lambda f: (is_hand_raised(f), is_second_glove_raised(f)),
                frame.copy(),
            )
            if prev_future and prev_future.done():
                hand_box, glove_box = prev_future.result()
            else:
                hand_box, glove_box = None, None
            prev_future = future

            # Horizontal centre of the first glove picks the effect zone;
            # vertical centre of the second glove drives the parameter.
            cx = hand_box[0] + hand_box[2] / 2 if hand_box else None
            cy = glove_box[1] + glove_box[3] / 2 if glove_box else None

            if cx is not None:
                player.set_effect_by_zone(cx, frame.shape[1])
                effect_idx = player.current_idx
                effect_name = EFFECT_NAMES[effect_idx]
                history.append(effect_idx)
            else:
                effect_name = "none"

            # Map glove height to [0, 1], clamped at the band edges.
            if cy is not None:
                frame_h = frame.shape[0]
                min_y = param_min_ratio * frame_h
                max_y = param_max_ratio * frame_h
                if cy <= min_y:
                    raw_param = 1.0
                elif cy >= max_y:
                    raw_param = 0.0
                else:
                    raw_param = 1.0 - (cy - min_y) / (max_y - min_y)
            else:
                raw_param = 0.5

            # Exponential smoothing keeps the audio parameter from jumping.
            smoothed_param = param_smooth * smoothed_param + (1.0 - param_smooth) * raw_param
            player.current_param = smoothed_param

            # Cue: the same effect selected for the whole history window,
            # rate-limited to at most once every 2 seconds.
            now = time.time()
            if len(history) == history.maxlen and len(set(history)) == 1 and now - last_cue_time > 2:
                cue_mode = True
                last_cue_time = now
            else:
                cue_mode = False

            # --- overlay drawing (removed the pointless `if True:` wrapper) ---
            if hand_box:
                x, y, w, h = hand_box
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            if glove_box:
                x2, y2, w2, h2 = glove_box
                cv2.rectangle(frame, (x2, y2), (x2 + w2, y2 + h2), (180, 105, 255), 2)

            cv2.putText(frame, f"Effect: {effect_name}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Parameter bar along the right edge, filled bottom-up.
            ph = int(player.current_param * frame.shape[0])
            cv2.rectangle(frame,
                          (frame.shape[1] - 50, frame.shape[0]),
                          (frame.shape[1] - 10, frame.shape[0] - ph),
                          (255, 0, 0), -1)

            if cue_mode:
                cv2.putText(frame, "CUE TRIGGERED", (frame.shape[1] // 2 - 100, 50),
                            cv2.FONT_HERSHEY_COMPLEX, 1.5, (0, 0, 255), 3)

            cv2.imshow("Gesture-DSP Demo", frame)
            key = cv2.waitKey(1) & 0xFF
            if key in (ord('q'), 27):
                break
            if not player.stream.is_active():
                break

    finally:
        player.stop()
        cap.release()
        cv2.destroyAllWindows()
        # BUGFIX: shut the detection worker down so the process can exit
        # cleanly instead of hanging on a live non-daemon thread.
        executor.shutdown(wait=False)


if __name__ == "__main__":
    main()