@@ -0,0 +1,489 @@ |
| 1 | +//! Content addressing system for ZephyrFS |
| 2 | +//! |
| 3 | +//! Provides cryptographic content identifiers and verification using Blake3 and SHA-256. |
| 4 | +//! Content IDs are used for integrity verification and deduplication. |
| 5 | + |
| 6 | +use crate::crypto::{ContentHasher, VerificationHasher}; |
| 7 | +use blake3::Hasher as Blake3Hasher; |
| 8 | +use serde::{Deserialize, Serialize}; |
| 9 | +use sha2::{Digest, Sha256}; |
| 10 | +use std::fmt; |
| 11 | + |
| 12 | +/// Content identifier - cryptographic hash of data |
| 13 | +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] |
| 14 | +pub struct ContentId { |
| 15 | + /// Hash algorithm used |
| 16 | + pub algorithm: HashAlgorithm, |
| 17 | + /// Hash bytes |
| 18 | + pub hash: Vec<u8>, |
| 19 | +} |
| 20 | + |
| 21 | +/// Supported hash algorithms |
| 22 | +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] |
| 23 | +pub enum HashAlgorithm { |
| 24 | + Blake3, |
| 25 | + Sha256, |
| 26 | +} |
| 27 | + |
| 28 | +impl ContentId { |
| 29 | + /// Generate content ID from data |
| 30 | + pub fn generate(data: &[u8], hasher: &ContentHasher) -> Self { |
| 31 | + match hasher { |
| 32 | + ContentHasher::Blake3 => { |
| 33 | + let mut hasher = Blake3Hasher::new(); |
| 34 | + hasher.update(data); |
| 35 | + let hash = hasher.finalize(); |
| 36 | + |
| 37 | + Self { |
| 38 | + algorithm: HashAlgorithm::Blake3, |
| 39 | + hash: hash.as_bytes().to_vec(), |
| 40 | + } |
| 41 | + } |
| 42 | + ContentHasher::Sha256 => { |
| 43 | + let mut hasher = Sha256::new(); |
| 44 | + hasher.update(data); |
| 45 | + let hash = hasher.finalize(); |
| 46 | + |
| 47 | + Self { |
| 48 | + algorithm: HashAlgorithm::Sha256, |
| 49 | + hash: hash.to_vec(), |
| 50 | + } |
| 51 | + } |
| 52 | + } |
| 53 | + } |
| 54 | + |
| 55 | + /// Create from existing hash bytes and algorithm |
| 56 | + pub fn from_hash(algorithm: HashAlgorithm, hash: Vec<u8>) -> Self { |
| 57 | + Self { algorithm, hash } |
| 58 | + } |
| 59 | + |
| 60 | + /// Get hash bytes |
| 61 | + pub fn hash_bytes(&self) -> &[u8] { |
| 62 | + &self.hash |
| 63 | + } |
| 64 | + |
| 65 | + /// Get algorithm used |
| 66 | + pub fn algorithm(&self) -> &HashAlgorithm { |
| 67 | + &self.algorithm |
| 68 | + } |
| 69 | + |
| 70 | + /// Convert to hex string representation |
| 71 | + pub fn to_hex(&self) -> String { |
| 72 | + hex::encode(&self.hash) |
| 73 | + } |
| 74 | + |
| 75 | + /// Create from hex string |
| 76 | + pub fn from_hex(algorithm: HashAlgorithm, hex_str: &str) -> Result<Self, hex::FromHexError> { |
| 77 | + let hash = hex::decode(hex_str)?; |
| 78 | + Ok(Self { algorithm, hash }) |
| 79 | + } |
| 80 | + |
| 81 | + /// Get expected hash length for algorithm |
| 82 | + pub fn expected_length(&self) -> usize { |
| 83 | + match self.algorithm { |
| 84 | + HashAlgorithm::Blake3 => 32, // Blake3 output is 32 bytes |
| 85 | + HashAlgorithm::Sha256 => 32, // SHA-256 output is 32 bytes |
| 86 | + } |
| 87 | + } |
| 88 | + |
| 89 | + /// Validate hash length matches algorithm |
| 90 | + pub fn is_valid(&self) -> bool { |
| 91 | + self.hash.len() == self.expected_length() |
| 92 | + } |
| 93 | + |
| 94 | + /// Create a multihash-style prefix for the content ID |
| 95 | + pub fn multihash_prefix(&self) -> Vec<u8> { |
| 96 | + let mut prefix = Vec::new(); |
| 97 | + |
| 98 | + // Add algorithm identifier |
| 99 | + match self.algorithm { |
| 100 | + HashAlgorithm::Blake3 => { |
| 101 | + prefix.push(0x1e); // Blake3 multicodec |
| 102 | + prefix.push(32); // Hash length |
| 103 | + } |
| 104 | + HashAlgorithm::Sha256 => { |
| 105 | + prefix.push(0x12); // SHA-256 multicodec |
| 106 | + prefix.push(32); // Hash length |
| 107 | + } |
| 108 | + } |
| 109 | + |
| 110 | + prefix.extend_from_slice(&self.hash); |
| 111 | + prefix |
| 112 | + } |
| 113 | + |
| 114 | + /// Parse from multihash format |
| 115 | + pub fn from_multihash(data: &[u8]) -> Result<Self, String> { |
| 116 | + if data.len() < 2 { |
| 117 | + return Err("Multihash too short".to_string()); |
| 118 | + } |
| 119 | + |
| 120 | + let algorithm = match data[0] { |
| 121 | + 0x1e => HashAlgorithm::Blake3, |
| 122 | + 0x12 => HashAlgorithm::Sha256, |
| 123 | + _ => return Err(format!("Unsupported hash algorithm: {}", data[0])), |
| 124 | + }; |
| 125 | + |
| 126 | + let expected_len = data[1] as usize; |
| 127 | + if data.len() != expected_len + 2 { |
| 128 | + return Err(format!( |
| 129 | + "Invalid multihash length: expected {}, got {}", |
| 130 | + expected_len + 2, |
| 131 | + data.len() |
| 132 | + )); |
| 133 | + } |
| 134 | + |
| 135 | + let hash = data[2..].to_vec(); |
| 136 | + let content_id = Self { algorithm, hash }; |
| 137 | + |
| 138 | + if !content_id.is_valid() { |
| 139 | + return Err("Invalid hash length for algorithm".to_string()); |
| 140 | + } |
| 141 | + |
| 142 | + Ok(content_id) |
| 143 | + } |
| 144 | +} |
| 145 | + |
| 146 | +impl fmt::Display for ContentId { |
| 147 | + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 148 | + write!(f, "{}:{}", |
| 149 | + match self.algorithm { |
| 150 | + HashAlgorithm::Blake3 => "blake3", |
| 151 | + HashAlgorithm::Sha256 => "sha256", |
| 152 | + }, |
| 153 | + self.to_hex() |
| 154 | + ) |
| 155 | + } |
| 156 | +} |
| 157 | + |
| 158 | +/// Content verification utilities |
| 159 | +pub struct ContentVerifier; |
| 160 | + |
| 161 | +impl ContentVerifier { |
| 162 | + /// Verify data matches expected content ID |
| 163 | + pub fn verify(data: &[u8], expected_id: &ContentId, hasher: &VerificationHasher) -> bool { |
| 164 | + let computed_id = match hasher { |
| 165 | + VerificationHasher::Blake3 => ContentId::generate(data, &ContentHasher::Blake3), |
| 166 | + VerificationHasher::Sha256 => ContentId::generate(data, &ContentHasher::Sha256), |
| 167 | + }; |
| 168 | + |
| 169 | + // Must use same algorithm and hash must match |
| 170 | + computed_id.algorithm == expected_id.algorithm && |
| 171 | + computed_id.hash == expected_id.hash |
| 172 | + } |
| 173 | + |
| 174 | + /// Verify data with multiple hash algorithms (for maximum security) |
| 175 | + pub fn verify_multi(data: &[u8], expected_ids: &[ContentId]) -> bool { |
| 176 | + for expected_id in expected_ids { |
| 177 | + let hasher = match expected_id.algorithm { |
| 178 | + HashAlgorithm::Blake3 => VerificationHasher::Blake3, |
| 179 | + HashAlgorithm::Sha256 => VerificationHasher::Sha256, |
| 180 | + }; |
| 181 | + |
| 182 | + if !Self::verify(data, expected_id, &hasher) { |
| 183 | + return false; |
| 184 | + } |
| 185 | + } |
| 186 | + true |
| 187 | + } |
| 188 | + |
| 189 | + /// Compute multiple hashes for data (Blake3 + SHA-256 for maximum security) |
| 190 | + pub fn compute_multi_hash(data: &[u8]) -> Vec<ContentId> { |
| 191 | + vec![ |
| 192 | + ContentId::generate(data, &ContentHasher::Blake3), |
| 193 | + ContentId::generate(data, &ContentHasher::Sha256), |
| 194 | + ] |
| 195 | + } |
| 196 | + |
| 197 | + /// Create integrity proof for data chunk |
| 198 | + pub fn create_integrity_proof(data: &[u8]) -> IntegrityProof { |
| 199 | + IntegrityProof { |
| 200 | + blake3_hash: ContentId::generate(data, &ContentHasher::Blake3), |
| 201 | + sha256_hash: ContentId::generate(data, &ContentHasher::Sha256), |
| 202 | + data_length: data.len() as u64, |
| 203 | + } |
| 204 | + } |
| 205 | + |
| 206 | + /// Verify data against integrity proof |
| 207 | + pub fn verify_integrity_proof(data: &[u8], proof: &IntegrityProof) -> bool { |
| 208 | + if data.len() as u64 != proof.data_length { |
| 209 | + return false; |
| 210 | + } |
| 211 | + |
| 212 | + let blake3_ok = Self::verify(data, &proof.blake3_hash, &VerificationHasher::Blake3); |
| 213 | + let sha256_ok = Self::verify(data, &proof.sha256_hash, &VerificationHasher::Sha256); |
| 214 | + |
| 215 | + blake3_ok && sha256_ok |
| 216 | + } |
| 217 | +} |
| 218 | + |
| 219 | +/// Integrity proof containing multiple hashes for maximum security |
| 220 | +#[derive(Debug, Clone, Serialize, Deserialize)] |
| 221 | +pub struct IntegrityProof { |
| 222 | + pub blake3_hash: ContentId, |
| 223 | + pub sha256_hash: ContentId, |
| 224 | + pub data_length: u64, |
| 225 | +} |
| 226 | + |
| 227 | +impl IntegrityProof { |
| 228 | + /// Get the primary content ID (Blake3 for performance) |
| 229 | + pub fn primary_id(&self) -> &ContentId { |
| 230 | + &self.blake3_hash |
| 231 | + } |
| 232 | + |
| 233 | + /// Get the secondary content ID (SHA-256 for compatibility) |
| 234 | + pub fn secondary_id(&self) -> &ContentId { |
| 235 | + &self.sha256_hash |
| 236 | + } |
| 237 | + |
| 238 | + /// Convert to multihash format (primary hash only) |
| 239 | + pub fn to_multihash(&self) -> Vec<u8> { |
| 240 | + self.blake3_hash.multihash_prefix() |
| 241 | + } |
| 242 | +} |
| 243 | + |
| 244 | +/// Content addressing utilities for file chunks |
| 245 | +pub struct ChunkAddressing; |
| 246 | + |
| 247 | +impl ChunkAddressing { |
| 248 | + /// Generate content ID for a file chunk with metadata |
| 249 | + pub fn chunk_id(chunk_data: &[u8], chunk_index: u64, file_id: &[u8]) -> ContentId { |
| 250 | + let mut hasher = Blake3Hasher::new(); |
| 251 | + |
| 252 | + // Include chunk metadata in hash for uniqueness |
| 253 | + hasher.update(b"ZephyrFS-chunk-v1:"); |
| 254 | + hasher.update(&chunk_index.to_be_bytes()); |
| 255 | + hasher.update(b":"); |
| 256 | + hasher.update(file_id); |
| 257 | + hasher.update(b":"); |
| 258 | + hasher.update(chunk_data); |
| 259 | + |
| 260 | + let hash = hasher.finalize(); |
| 261 | + ContentId { |
| 262 | + algorithm: HashAlgorithm::Blake3, |
| 263 | + hash: hash.as_bytes().to_vec(), |
| 264 | + } |
| 265 | + } |
| 266 | + |
| 267 | + /// Generate file-level content ID from chunk IDs |
| 268 | + pub fn file_id_from_chunks(chunk_ids: &[ContentId]) -> ContentId { |
| 269 | + let mut hasher = Blake3Hasher::new(); |
| 270 | + hasher.update(b"ZephyrFS-file-v1:"); |
| 271 | + |
| 272 | + for chunk_id in chunk_ids { |
| 273 | + hasher.update(&chunk_id.hash); |
| 274 | + } |
| 275 | + |
| 276 | + let hash = hasher.finalize(); |
| 277 | + ContentId { |
| 278 | + algorithm: HashAlgorithm::Blake3, |
| 279 | + hash: hash.as_bytes().to_vec(), |
| 280 | + } |
| 281 | + } |
| 282 | + |
| 283 | + /// Create Merkle tree root from chunk hashes |
| 284 | + pub fn merkle_root(chunk_ids: &[ContentId]) -> ContentId { |
| 285 | + if chunk_ids.is_empty() { |
| 286 | + // Empty file hash |
| 287 | + return ContentId::generate(b"", &ContentHasher::Blake3); |
| 288 | + } |
| 289 | + |
| 290 | + if chunk_ids.len() == 1 { |
| 291 | + return chunk_ids[0].clone(); |
| 292 | + } |
| 293 | + |
| 294 | + // Build Merkle tree bottom-up |
| 295 | + let mut level: Vec<ContentId> = chunk_ids.to_vec(); |
| 296 | + |
| 297 | + while level.len() > 1 { |
| 298 | + let mut next_level = Vec::new(); |
| 299 | + |
| 300 | + for pair in level.chunks(2) { |
| 301 | + let mut hasher = Blake3Hasher::new(); |
| 302 | + hasher.update(b"ZephyrFS-merkle-v1:"); |
| 303 | + hasher.update(&pair[0].hash); |
| 304 | + |
| 305 | + if pair.len() == 2 { |
| 306 | + hasher.update(&pair[1].hash); |
| 307 | + } else { |
| 308 | + // Odd number of nodes - hash with itself |
| 309 | + hasher.update(&pair[0].hash); |
| 310 | + } |
| 311 | + |
| 312 | + let hash = hasher.finalize(); |
| 313 | + next_level.push(ContentId { |
| 314 | + algorithm: HashAlgorithm::Blake3, |
| 315 | + hash: hash.as_bytes().to_vec(), |
| 316 | + }); |
| 317 | + } |
| 318 | + |
| 319 | + level = next_level; |
| 320 | + } |
| 321 | + |
| 322 | + level.into_iter().next().unwrap() |
| 323 | + } |
| 324 | +} |
| 325 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: Blake3 content ID of `data`.
    fn blake3_id(data: &[u8]) -> ContentId {
        ContentId::generate(data, &ContentHasher::Blake3)
    }

    /// Shorthand: SHA-256 content ID of `data`.
    fn sha256_id(data: &[u8]) -> ContentId {
        ContentId::generate(data, &ContentHasher::Sha256)
    }

    #[test]
    fn test_content_id_generation() {
        let data = b"Hello, ZephyrFS!";

        let blake3 = blake3_id(data);
        let sha256 = sha256_id(data);

        assert_eq!(blake3.algorithm, HashAlgorithm::Blake3);
        assert_eq!(sha256.algorithm, HashAlgorithm::Sha256);
        assert_eq!(blake3.hash.len(), 32);
        assert_eq!(sha256.hash.len(), 32);
        assert_ne!(blake3.hash, sha256.hash);

        // Freshly generated IDs always pass length validation.
        assert!(blake3.is_valid());
        assert!(sha256.is_valid());

        // Hashing is deterministic.
        assert_eq!(blake3, blake3_id(data));
        assert_eq!(sha256, sha256_id(data));
    }

    #[test]
    fn test_sha256_known_vector() {
        // SHA-256 of the empty string is a published test vector.
        let id = sha256_id(b"");
        assert_eq!(
            id.to_hex(),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    #[test]
    fn test_content_verification() {
        let data = b"Test data for verification";
        let content_id = blake3_id(data);

        // Correct data should verify
        assert!(ContentVerifier::verify(data, &content_id, &VerificationHasher::Blake3));

        // Wrong data should not verify
        let wrong_data = b"Wrong data";
        assert!(!ContentVerifier::verify(wrong_data, &content_id, &VerificationHasher::Blake3));
    }

    #[test]
    fn test_verification_rejects_algorithm_mismatch() {
        let data = b"Algorithm mismatch";
        let blake3 = blake3_id(data);

        // Verifying a Blake3 ID with the SHA-256 hasher must fail.
        assert!(!ContentVerifier::verify(data, &blake3, &VerificationHasher::Sha256));

        // Correct digest bytes under the wrong algorithm tag must fail too.
        let mislabeled = ContentId::from_hash(HashAlgorithm::Sha256, blake3.hash.clone());
        assert!(!ContentVerifier::verify(data, &mislabeled, &VerificationHasher::Blake3));
    }

    #[test]
    fn test_content_id_hex_serialization() {
        let original_id = blake3_id(b"Serialization test");

        let hex_str = original_id.to_hex();
        assert_eq!(hex_str.len(), 64);

        let restored_id = ContentId::from_hex(HashAlgorithm::Blake3, &hex_str).unwrap();
        assert_eq!(original_id, restored_id);
    }

    #[test]
    fn test_from_hex_rejects_invalid_input() {
        // Non-hex characters.
        assert!(ContentId::from_hex(HashAlgorithm::Blake3, "zz").is_err());
        // Odd number of hex digits.
        assert!(ContentId::from_hex(HashAlgorithm::Sha256, "abc").is_err());
    }

    #[test]
    fn test_is_valid_checks_length() {
        assert!(ContentId::from_hash(HashAlgorithm::Blake3, vec![0u8; 32]).is_valid());
        assert!(!ContentId::from_hash(HashAlgorithm::Blake3, vec![0u8; 31]).is_valid());
        assert!(!ContentId::from_hash(HashAlgorithm::Sha256, Vec::new()).is_valid());
    }

    #[test]
    fn test_multihash_format() {
        let data = b"Multihash test data";
        let content_id = blake3_id(data);

        let multihash = content_id.multihash_prefix();
        assert_eq!(multihash.len(), 34);
        assert_eq!(&multihash[..2], &[0x1e, 32]);

        let restored_id = ContentId::from_multihash(&multihash).unwrap();
        assert_eq!(content_id, restored_id);

        // Test SHA-256 as well
        let sha256 = sha256_id(data);
        let sha256_multihash = sha256.multihash_prefix();
        assert_eq!(&sha256_multihash[..2], &[0x12, 32]);

        let restored_sha256 = ContentId::from_multihash(&sha256_multihash).unwrap();
        assert_eq!(sha256, restored_sha256);
    }

    #[test]
    fn test_from_multihash_rejects_malformed_input() {
        // Too short to hold the two-byte header.
        assert!(ContentId::from_multihash(&[]).is_err());
        assert!(ContentId::from_multihash(&[0x1e]).is_err());

        // Unknown algorithm code.
        let mut unknown = vec![0xff, 32];
        unknown.extend_from_slice(&[0u8; 32]);
        assert!(ContentId::from_multihash(&unknown).is_err());

        // Truncated payload: declared length disagrees with actual length.
        let mut truncated = blake3_id(b"truncated").multihash_prefix();
        truncated.pop();
        assert!(ContentId::from_multihash(&truncated).is_err());

        // Self-consistent framing, but digest too short for the algorithm.
        assert!(ContentId::from_multihash(&[0x12, 4, 1, 2, 3, 4]).is_err());
    }

    #[test]
    fn test_integrity_proof() {
        let data = b"Integrity proof test data";
        let proof = ContentVerifier::create_integrity_proof(data);

        assert_eq!(proof.data_length, data.len() as u64);
        assert_eq!(proof.blake3_hash.algorithm, HashAlgorithm::Blake3);
        assert_eq!(proof.sha256_hash.algorithm, HashAlgorithm::Sha256);

        // Correct data should verify
        assert!(ContentVerifier::verify_integrity_proof(data, &proof));

        // Wrong data should not verify
        let wrong_data = b"Wrong data";
        assert!(!ContentVerifier::verify_integrity_proof(wrong_data, &proof));

        // Same length, different content: must be caught by the digests.
        let mut tampered = data.to_vec();
        tampered[0] ^= 0xff;
        assert!(!ContentVerifier::verify_integrity_proof(&tampered, &proof));

        // Matching digests but wrong recorded length must fail.
        let mut bad_length = proof.clone();
        bad_length.data_length += 1;
        assert!(!ContentVerifier::verify_integrity_proof(data, &bad_length));
    }

    #[test]
    fn test_integrity_proof_accessors() {
        let proof = ContentVerifier::create_integrity_proof(b"accessors");

        assert_eq!(proof.primary_id(), &proof.blake3_hash);
        assert_eq!(proof.secondary_id(), &proof.sha256_hash);
        assert_eq!(proof.to_multihash(), proof.blake3_hash.multihash_prefix());
    }

    #[test]
    fn test_chunk_addressing() {
        let chunk_data = b"Chunk data for addressing test";
        let file_id = b"test_file_12345";

        let chunk_id = ChunkAddressing::chunk_id(chunk_data, 0, file_id);
        assert_eq!(chunk_id.algorithm, HashAlgorithm::Blake3);
        assert_eq!(chunk_id.hash.len(), 32);

        // Same inputs produce the same ID.
        assert_eq!(chunk_id, ChunkAddressing::chunk_id(chunk_data, 0, file_id));

        // Different chunk index should produce different ID
        let chunk_id2 = ChunkAddressing::chunk_id(chunk_data, 1, file_id);
        assert_ne!(chunk_id, chunk_id2);

        // Different file ID should produce different ID
        let chunk_id3 = ChunkAddressing::chunk_id(chunk_data, 0, b"different_file");
        assert_ne!(chunk_id, chunk_id3);

        // Different chunk data should produce different ID
        let chunk_id4 = ChunkAddressing::chunk_id(b"other chunk data", 0, file_id);
        assert_ne!(chunk_id, chunk_id4);
    }

    #[test]
    fn test_file_id_from_chunks() {
        let chunk1 = blake3_id(b"chunk 1");
        let chunk2 = blake3_id(b"chunk 2");
        let chunk3 = blake3_id(b"chunk 3");

        let file_id =
            ChunkAddressing::file_id_from_chunks(&[chunk1.clone(), chunk2.clone(), chunk3.clone()]);
        assert_eq!(file_id.algorithm, HashAlgorithm::Blake3);
        assert_eq!(file_id.hash.len(), 32);

        // Different order should produce different file ID
        let file_id2 = ChunkAddressing::file_id_from_chunks(&[chunk3, chunk1, chunk2]);
        assert_ne!(file_id, file_id2);
    }

    #[test]
    fn test_merkle_root() {
        let chunk1 = blake3_id(b"chunk 1");
        let chunk2 = blake3_id(b"chunk 2");
        let chunk3 = blake3_id(b"chunk 3");
        let chunk4 = blake3_id(b"chunk 4");

        // Single chunk
        let root1 = ChunkAddressing::merkle_root(&[chunk1.clone()]);
        assert_eq!(root1, chunk1);

        // Multiple chunks
        let root2 = ChunkAddressing::merkle_root(&[chunk1.clone(), chunk2.clone()]);
        let root4 = ChunkAddressing::merkle_root(&[chunk1, chunk2, chunk3, chunk4]);

        assert_ne!(root2, root4);
        assert_eq!(root2.algorithm, HashAlgorithm::Blake3);
        assert_eq!(root4.algorithm, HashAlgorithm::Blake3);

        // Empty chunks
        let empty_root = ChunkAddressing::merkle_root(&[]);
        assert_eq!(empty_root.algorithm, HashAlgorithm::Blake3);
        assert_eq!(empty_root, blake3_id(b""));
    }

    #[test]
    fn test_merkle_root_odd_count_and_order() {
        let chunk1 = blake3_id(b"chunk 1");
        let chunk2 = blake3_id(b"chunk 2");
        let chunk3 = blake3_id(b"chunk 3");

        let pair = [chunk1.clone(), chunk2.clone()];
        let triple = [chunk1.clone(), chunk2.clone(), chunk3];

        // Odd node counts are handled and the result is deterministic.
        let root3 = ChunkAddressing::merkle_root(&triple);
        assert_eq!(root3.algorithm, HashAlgorithm::Blake3);
        assert_eq!(root3.hash.len(), 32);
        assert_eq!(root3, ChunkAddressing::merkle_root(&triple));
        assert_ne!(root3, ChunkAddressing::merkle_root(&pair));

        // Swapping leaves changes the root.
        let swapped = [chunk2, chunk1];
        assert_ne!(
            ChunkAddressing::merkle_root(&pair),
            ChunkAddressing::merkle_root(&swapped)
        );
    }

    #[test]
    fn test_content_id_display() {
        let content_id = blake3_id(b"display test");
        let display_str = content_id.to_string();

        assert!(display_str.starts_with("blake3:"));
        assert_eq!(display_str.len(), 71); // "blake3:" (7) + hex hash (64)
        assert!(display_str.ends_with(&content_id.to_hex()));

        let sha256 = sha256_id(b"display test");
        let sha256_str = sha256.to_string();

        assert!(sha256_str.starts_with("sha256:"));
        assert_eq!(sha256_str.len(), 71); // "sha256:" (7) + hex hash (64)
        assert!(sha256_str.ends_with(&sha256.to_hex()));
    }

    #[test]
    fn test_multi_hash_verification() {
        let data = b"Multi hash verification test";
        let ids = vec![blake3_id(data), sha256_id(data)];

        // Correct data should verify against all hashes
        assert!(ContentVerifier::verify_multi(data, &ids));

        // Wrong data should fail verification
        let wrong_data = b"Wrong data";
        assert!(!ContentVerifier::verify_multi(wrong_data, &ids));

        // A single mismatching entry fails the whole set.
        let mixed = vec![blake3_id(data), sha256_id(b"something else")];
        assert!(!ContentVerifier::verify_multi(data, &mixed));
    }

    #[test]
    fn test_compute_multi_hash() {
        let data = b"Compute multi hash";
        let ids = ContentVerifier::compute_multi_hash(data);

        assert_eq!(ids.len(), 2);
        assert_eq!(ids[0], blake3_id(data));
        assert_eq!(ids[1], sha256_id(data));
        assert!(ContentVerifier::verify_multi(data, &ids));
    }
}