| 1 |
//! Content addressing system for ZephyrFS.
//!
//! Provides cryptographic content identifiers and verification using Blake3 and SHA-256.
//! Content IDs are used for integrity verification and deduplication.
| 5 |
|
| 6 |
use crate::crypto::{ContentHasher, VerificationHasher}; |
| 7 |
use blake3::Hasher as Blake3Hasher; |
| 8 |
use serde::{Deserialize, Serialize}; |
| 9 |
use sha2::{Digest, Sha256}; |
| 10 |
use std::fmt; |
| 11 |
|
| 12 |
/// Content identifier - cryptographic hash of data |
| 13 |
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] |
| 14 |
pub struct ContentId { |
| 15 |
/// Hash algorithm used |
| 16 |
pub algorithm: HashAlgorithm, |
| 17 |
/// Hash bytes |
| 18 |
pub hash: Vec<u8>, |
| 19 |
} |
| 20 |
|
| 21 |
/// Supported hash algorithms |
| 22 |
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] |
| 23 |
pub enum HashAlgorithm { |
| 24 |
Blake3, |
| 25 |
Sha256, |
| 26 |
} |
| 27 |
|
| 28 |
impl ContentId { |
| 29 |
/// Generate content ID from data |
| 30 |
pub fn generate(data: &[u8], hasher: &ContentHasher) -> Self { |
| 31 |
match hasher { |
| 32 |
ContentHasher::Blake3 => { |
| 33 |
let mut hasher = Blake3Hasher::new(); |
| 34 |
hasher.update(data); |
| 35 |
let hash = hasher.finalize(); |
| 36 |
|
| 37 |
Self { |
| 38 |
algorithm: HashAlgorithm::Blake3, |
| 39 |
hash: hash.as_bytes().to_vec(), |
| 40 |
} |
| 41 |
} |
| 42 |
ContentHasher::Sha256 => { |
| 43 |
let mut hasher = Sha256::new(); |
| 44 |
hasher.update(data); |
| 45 |
let hash = hasher.finalize(); |
| 46 |
|
| 47 |
Self { |
| 48 |
algorithm: HashAlgorithm::Sha256, |
| 49 |
hash: hash.to_vec(), |
| 50 |
} |
| 51 |
} |
| 52 |
} |
| 53 |
} |
| 54 |
|
| 55 |
/// Create from existing hash bytes and algorithm |
| 56 |
pub fn from_hash(algorithm: HashAlgorithm, hash: Vec<u8>) -> Self { |
| 57 |
Self { algorithm, hash } |
| 58 |
} |
| 59 |
|
| 60 |
/// Get hash bytes |
| 61 |
pub fn hash_bytes(&self) -> &[u8] { |
| 62 |
&self.hash |
| 63 |
} |
| 64 |
|
| 65 |
/// Get algorithm used |
| 66 |
pub fn algorithm(&self) -> &HashAlgorithm { |
| 67 |
&self.algorithm |
| 68 |
} |
| 69 |
|
| 70 |
/// Convert to hex string representation |
| 71 |
pub fn to_hex(&self) -> String { |
| 72 |
hex::encode(&self.hash) |
| 73 |
} |
| 74 |
|
| 75 |
/// Create from hex string |
| 76 |
pub fn from_hex(algorithm: HashAlgorithm, hex_str: &str) -> Result<Self, hex::FromHexError> { |
| 77 |
let hash = hex::decode(hex_str)?; |
| 78 |
Ok(Self { algorithm, hash }) |
| 79 |
} |
| 80 |
|
| 81 |
/// Get expected hash length for algorithm |
| 82 |
pub fn expected_length(&self) -> usize { |
| 83 |
match self.algorithm { |
| 84 |
HashAlgorithm::Blake3 => 32, // Blake3 output is 32 bytes |
| 85 |
HashAlgorithm::Sha256 => 32, // SHA-256 output is 32 bytes |
| 86 |
} |
| 87 |
} |
| 88 |
|
| 89 |
/// Validate hash length matches algorithm |
| 90 |
pub fn is_valid(&self) -> bool { |
| 91 |
self.hash.len() == self.expected_length() |
| 92 |
} |
| 93 |
|
| 94 |
/// Create a multihash-style prefix for the content ID |
| 95 |
pub fn multihash_prefix(&self) -> Vec<u8> { |
| 96 |
let mut prefix = Vec::new(); |
| 97 |
|
| 98 |
// Add algorithm identifier |
| 99 |
match self.algorithm { |
| 100 |
HashAlgorithm::Blake3 => { |
| 101 |
prefix.push(0x1e); // Blake3 multicodec |
| 102 |
prefix.push(32); // Hash length |
| 103 |
} |
| 104 |
HashAlgorithm::Sha256 => { |
| 105 |
prefix.push(0x12); // SHA-256 multicodec |
| 106 |
prefix.push(32); // Hash length |
| 107 |
} |
| 108 |
} |
| 109 |
|
| 110 |
prefix.extend_from_slice(&self.hash); |
| 111 |
prefix |
| 112 |
} |
| 113 |
|
| 114 |
/// Parse from multihash format |
| 115 |
pub fn from_multihash(data: &[u8]) -> Result<Self, String> { |
| 116 |
if data.len() < 2 { |
| 117 |
return Err("Multihash too short".to_string()); |
| 118 |
} |
| 119 |
|
| 120 |
let algorithm = match data[0] { |
| 121 |
0x1e => HashAlgorithm::Blake3, |
| 122 |
0x12 => HashAlgorithm::Sha256, |
| 123 |
_ => return Err(format!("Unsupported hash algorithm: {}", data[0])), |
| 124 |
}; |
| 125 |
|
| 126 |
let expected_len = data[1] as usize; |
| 127 |
if data.len() != expected_len + 2 { |
| 128 |
return Err(format!( |
| 129 |
"Invalid multihash length: expected {}, got {}", |
| 130 |
expected_len + 2, |
| 131 |
data.len() |
| 132 |
)); |
| 133 |
} |
| 134 |
|
| 135 |
let hash = data[2..].to_vec(); |
| 136 |
let content_id = Self { algorithm, hash }; |
| 137 |
|
| 138 |
if !content_id.is_valid() { |
| 139 |
return Err("Invalid hash length for algorithm".to_string()); |
| 140 |
} |
| 141 |
|
| 142 |
Ok(content_id) |
| 143 |
} |
| 144 |
} |
| 145 |
|
| 146 |
impl fmt::Display for ContentId { |
| 147 |
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 148 |
write!(f, "{}:{}", |
| 149 |
match self.algorithm { |
| 150 |
HashAlgorithm::Blake3 => "blake3", |
| 151 |
HashAlgorithm::Sha256 => "sha256", |
| 152 |
}, |
| 153 |
self.to_hex() |
| 154 |
) |
| 155 |
} |
| 156 |
} |
| 157 |
|
| 158 |
/// Content verification utilities |
| 159 |
pub struct ContentVerifier; |
| 160 |
|
| 161 |
impl ContentVerifier { |
| 162 |
/// Verify data matches expected content ID |
| 163 |
pub fn verify(data: &[u8], expected_id: &ContentId, hasher: &VerificationHasher) -> bool { |
| 164 |
let computed_id = match hasher { |
| 165 |
VerificationHasher::Blake3 => ContentId::generate(data, &ContentHasher::Blake3), |
| 166 |
VerificationHasher::Sha256 => ContentId::generate(data, &ContentHasher::Sha256), |
| 167 |
}; |
| 168 |
|
| 169 |
// Must use same algorithm and hash must match |
| 170 |
computed_id.algorithm == expected_id.algorithm && |
| 171 |
computed_id.hash == expected_id.hash |
| 172 |
} |
| 173 |
|
| 174 |
/// Verify data with multiple hash algorithms (for maximum security) |
| 175 |
pub fn verify_multi(data: &[u8], expected_ids: &[ContentId]) -> bool { |
| 176 |
for expected_id in expected_ids { |
| 177 |
let hasher = match expected_id.algorithm { |
| 178 |
HashAlgorithm::Blake3 => VerificationHasher::Blake3, |
| 179 |
HashAlgorithm::Sha256 => VerificationHasher::Sha256, |
| 180 |
}; |
| 181 |
|
| 182 |
if !Self::verify(data, expected_id, &hasher) { |
| 183 |
return false; |
| 184 |
} |
| 185 |
} |
| 186 |
true |
| 187 |
} |
| 188 |
|
| 189 |
/// Compute multiple hashes for data (Blake3 + SHA-256 for maximum security) |
| 190 |
pub fn compute_multi_hash(data: &[u8]) -> Vec<ContentId> { |
| 191 |
vec![ |
| 192 |
ContentId::generate(data, &ContentHasher::Blake3), |
| 193 |
ContentId::generate(data, &ContentHasher::Sha256), |
| 194 |
] |
| 195 |
} |
| 196 |
|
| 197 |
/// Create integrity proof for data chunk |
| 198 |
pub fn create_integrity_proof(data: &[u8]) -> IntegrityProof { |
| 199 |
IntegrityProof { |
| 200 |
blake3_hash: ContentId::generate(data, &ContentHasher::Blake3), |
| 201 |
sha256_hash: ContentId::generate(data, &ContentHasher::Sha256), |
| 202 |
data_length: data.len() as u64, |
| 203 |
} |
| 204 |
} |
| 205 |
|
| 206 |
/// Verify data against integrity proof |
| 207 |
pub fn verify_integrity_proof(data: &[u8], proof: &IntegrityProof) -> bool { |
| 208 |
if data.len() as u64 != proof.data_length { |
| 209 |
return false; |
| 210 |
} |
| 211 |
|
| 212 |
let blake3_ok = Self::verify(data, &proof.blake3_hash, &VerificationHasher::Blake3); |
| 213 |
let sha256_ok = Self::verify(data, &proof.sha256_hash, &VerificationHasher::Sha256); |
| 214 |
|
| 215 |
blake3_ok && sha256_ok |
| 216 |
} |
| 217 |
} |
| 218 |
|
| 219 |
/// Integrity proof containing multiple hashes for maximum security |
| 220 |
#[derive(Debug, Clone, Serialize, Deserialize)] |
| 221 |
pub struct IntegrityProof { |
| 222 |
pub blake3_hash: ContentId, |
| 223 |
pub sha256_hash: ContentId, |
| 224 |
pub data_length: u64, |
| 225 |
} |
| 226 |
|
| 227 |
impl IntegrityProof { |
| 228 |
/// Get the primary content ID (Blake3 for performance) |
| 229 |
pub fn primary_id(&self) -> &ContentId { |
| 230 |
&self.blake3_hash |
| 231 |
} |
| 232 |
|
| 233 |
/// Get the secondary content ID (SHA-256 for compatibility) |
| 234 |
pub fn secondary_id(&self) -> &ContentId { |
| 235 |
&self.sha256_hash |
| 236 |
} |
| 237 |
|
| 238 |
/// Convert to multihash format (primary hash only) |
| 239 |
pub fn to_multihash(&self) -> Vec<u8> { |
| 240 |
self.blake3_hash.multihash_prefix() |
| 241 |
} |
| 242 |
} |
| 243 |
|
| 244 |
/// Content addressing utilities for file chunks |
| 245 |
pub struct ChunkAddressing; |
| 246 |
|
| 247 |
impl ChunkAddressing { |
| 248 |
/// Generate content ID for a file chunk with metadata |
| 249 |
pub fn chunk_id(chunk_data: &[u8], chunk_index: u64, file_id: &[u8]) -> ContentId { |
| 250 |
let mut hasher = Blake3Hasher::new(); |
| 251 |
|
| 252 |
// Include chunk metadata in hash for uniqueness |
| 253 |
hasher.update(b"ZephyrFS-chunk-v1:"); |
| 254 |
hasher.update(&chunk_index.to_be_bytes()); |
| 255 |
hasher.update(b":"); |
| 256 |
hasher.update(file_id); |
| 257 |
hasher.update(b":"); |
| 258 |
hasher.update(chunk_data); |
| 259 |
|
| 260 |
let hash = hasher.finalize(); |
| 261 |
ContentId { |
| 262 |
algorithm: HashAlgorithm::Blake3, |
| 263 |
hash: hash.as_bytes().to_vec(), |
| 264 |
} |
| 265 |
} |
| 266 |
|
| 267 |
/// Generate file-level content ID from chunk IDs |
| 268 |
pub fn file_id_from_chunks(chunk_ids: &[ContentId]) -> ContentId { |
| 269 |
let mut hasher = Blake3Hasher::new(); |
| 270 |
hasher.update(b"ZephyrFS-file-v1:"); |
| 271 |
|
| 272 |
for chunk_id in chunk_ids { |
| 273 |
hasher.update(&chunk_id.hash); |
| 274 |
} |
| 275 |
|
| 276 |
let hash = hasher.finalize(); |
| 277 |
ContentId { |
| 278 |
algorithm: HashAlgorithm::Blake3, |
| 279 |
hash: hash.as_bytes().to_vec(), |
| 280 |
} |
| 281 |
} |
| 282 |
|
| 283 |
/// Create Merkle tree root from chunk hashes |
| 284 |
pub fn merkle_root(chunk_ids: &[ContentId]) -> ContentId { |
| 285 |
if chunk_ids.is_empty() { |
| 286 |
// Empty file hash |
| 287 |
return ContentId::generate(b"", &ContentHasher::Blake3); |
| 288 |
} |
| 289 |
|
| 290 |
if chunk_ids.len() == 1 { |
| 291 |
return chunk_ids[0].clone(); |
| 292 |
} |
| 293 |
|
| 294 |
// Build Merkle tree bottom-up |
| 295 |
let mut level: Vec<ContentId> = chunk_ids.to_vec(); |
| 296 |
|
| 297 |
while level.len() > 1 { |
| 298 |
let mut next_level = Vec::new(); |
| 299 |
|
| 300 |
for pair in level.chunks(2) { |
| 301 |
let mut hasher = Blake3Hasher::new(); |
| 302 |
hasher.update(b"ZephyrFS-merkle-v1:"); |
| 303 |
hasher.update(&pair[0].hash); |
| 304 |
|
| 305 |
if pair.len() == 2 { |
| 306 |
hasher.update(&pair[1].hash); |
| 307 |
} else { |
| 308 |
// Odd number of nodes - hash with itself |
| 309 |
hasher.update(&pair[0].hash); |
| 310 |
} |
| 311 |
|
| 312 |
let hash = hasher.finalize(); |
| 313 |
next_level.push(ContentId { |
| 314 |
algorithm: HashAlgorithm::Blake3, |
| 315 |
hash: hash.as_bytes().to_vec(), |
| 316 |
}); |
| 317 |
} |
| 318 |
|
| 319 |
level = next_level; |
| 320 |
} |
| 321 |
|
| 322 |
level.into_iter().next().unwrap() |
| 323 |
} |
| 324 |
} |
| 325 |
|
| 326 |
#[cfg(test)]
mod tests {
    use super::*;

    /// Both algorithms tag their IDs correctly and yield distinct 32-byte digests.
    #[test]
    fn test_content_id_generation() {
        let payload = b"Hello, ZephyrFS!";

        let from_blake3 = ContentId::generate(payload, &ContentHasher::Blake3);
        let from_sha256 = ContentId::generate(payload, &ContentHasher::Sha256);

        assert_eq!(from_blake3.algorithm, HashAlgorithm::Blake3);
        assert_eq!(from_sha256.algorithm, HashAlgorithm::Sha256);
        assert_eq!(from_blake3.hash.len(), 32);
        assert_eq!(from_sha256.hash.len(), 32);
        assert_ne!(from_blake3.hash, from_sha256.hash);
    }

    /// Matching data verifies; different data does not.
    #[test]
    fn test_content_verification() {
        let payload = b"Test data for verification";
        let id = ContentId::generate(payload, &ContentHasher::Blake3);

        // Correct data should verify
        assert!(ContentVerifier::verify(payload, &id, &VerificationHasher::Blake3));

        // Wrong data should not verify
        let tampered = b"Wrong data";
        assert!(!ContentVerifier::verify(tampered, &id, &VerificationHasher::Blake3));
    }

    /// Hex encoding round-trips back to an equal ID.
    #[test]
    fn test_content_id_hex_serialization() {
        let payload = b"Serialization test";
        let original = ContentId::generate(payload, &ContentHasher::Blake3);

        let encoded = original.to_hex();
        let decoded = ContentId::from_hex(HashAlgorithm::Blake3, &encoded).unwrap();

        assert_eq!(original, decoded);
    }

    /// Multihash encoding round-trips for both algorithms.
    #[test]
    fn test_multihash_format() {
        let payload = b"Multihash test data";

        let blake3_id = ContentId::generate(payload, &ContentHasher::Blake3);
        let blake3_bytes = blake3_id.multihash_prefix();
        let blake3_back = ContentId::from_multihash(&blake3_bytes).unwrap();
        assert_eq!(blake3_id, blake3_back);

        // Test SHA-256 as well
        let sha256_id = ContentId::generate(payload, &ContentHasher::Sha256);
        let sha256_bytes = sha256_id.multihash_prefix();
        let sha256_back = ContentId::from_multihash(&sha256_bytes).unwrap();
        assert_eq!(sha256_id, sha256_back);
    }

    /// A proof records length and both digests, and only verifies the original data.
    #[test]
    fn test_integrity_proof() {
        let payload = b"Integrity proof test data";
        let proof = ContentVerifier::create_integrity_proof(payload);

        assert_eq!(proof.data_length, payload.len() as u64);
        assert_eq!(proof.blake3_hash.algorithm, HashAlgorithm::Blake3);
        assert_eq!(proof.sha256_hash.algorithm, HashAlgorithm::Sha256);

        // Correct data should verify
        assert!(ContentVerifier::verify_integrity_proof(payload, &proof));

        // Wrong data should not verify
        let tampered = b"Wrong data";
        assert!(!ContentVerifier::verify_integrity_proof(tampered, &proof));
    }

    /// Chunk IDs depend on the chunk index and the file ID, not just the bytes.
    #[test]
    fn test_chunk_addressing() {
        let bytes = b"Chunk data for addressing test";
        let owner = b"test_file_12345";

        let base = ChunkAddressing::chunk_id(bytes, 0, owner);
        assert_eq!(base.algorithm, HashAlgorithm::Blake3);
        assert_eq!(base.hash.len(), 32);

        // Different chunk index should produce different ID
        let other_index = ChunkAddressing::chunk_id(bytes, 1, owner);
        assert_ne!(base, other_index);

        // Different file ID should produce different ID
        let other_file = ChunkAddressing::chunk_id(bytes, 0, b"different_file");
        assert_ne!(base, other_file);
    }

    /// File IDs are sensitive to chunk order.
    #[test]
    fn test_file_id_from_chunks() {
        let first = ContentId::generate(b"chunk 1", &ContentHasher::Blake3);
        let second = ContentId::generate(b"chunk 2", &ContentHasher::Blake3);
        let third = ContentId::generate(b"chunk 3", &ContentHasher::Blake3);

        let in_order =
            ChunkAddressing::file_id_from_chunks(&[first.clone(), second.clone(), third.clone()]);
        assert_eq!(in_order.algorithm, HashAlgorithm::Blake3);

        // Different order should produce different file ID
        let rotated = ChunkAddressing::file_id_from_chunks(&[third, first, second]);
        assert_ne!(in_order, rotated);
    }

    /// Merkle roots: single chunk is returned as-is, larger sets differ, empty is Blake3.
    #[test]
    fn test_merkle_root() {
        let first = ContentId::generate(b"chunk 1", &ContentHasher::Blake3);
        let second = ContentId::generate(b"chunk 2", &ContentHasher::Blake3);
        let third = ContentId::generate(b"chunk 3", &ContentHasher::Blake3);
        let fourth = ContentId::generate(b"chunk 4", &ContentHasher::Blake3);

        // Single chunk
        let root_of_one = ChunkAddressing::merkle_root(&[first.clone()]);
        assert_eq!(root_of_one, first);

        // Multiple chunks
        let root_of_two = ChunkAddressing::merkle_root(&[first.clone(), second.clone()]);
        let root_of_four = ChunkAddressing::merkle_root(&[first, second, third, fourth]);

        assert_ne!(root_of_two, root_of_four);
        assert_eq!(root_of_two.algorithm, HashAlgorithm::Blake3);
        assert_eq!(root_of_four.algorithm, HashAlgorithm::Blake3);

        // Empty chunks
        let root_of_none = ChunkAddressing::merkle_root(&[]);
        assert_eq!(root_of_none.algorithm, HashAlgorithm::Blake3);
    }

    /// `Display` renders `<algorithm>:<64 hex chars>`.
    #[test]
    fn test_content_id_display() {
        let blake3_text = ContentId::generate(b"display test", &ContentHasher::Blake3).to_string();

        assert!(blake3_text.starts_with("blake3:"));
        assert_eq!(blake3_text.len(), 71); // "blake3:" (7) + hex hash (64)

        let sha256_text = ContentId::generate(b"display test", &ContentHasher::Sha256).to_string();

        assert!(sha256_text.starts_with("sha256:"));
        assert_eq!(sha256_text.len(), 71); // "sha256:" (7) + hex hash (64)
    }

    /// `verify_multi` accepts data matching every ID and rejects anything else.
    #[test]
    fn test_multi_hash_verification() {
        let payload = b"Multi hash verification test";
        let expected = vec![
            ContentId::generate(payload, &ContentHasher::Blake3),
            ContentId::generate(payload, &ContentHasher::Sha256),
        ];

        // Correct data should verify against all hashes
        assert!(ContentVerifier::verify_multi(payload, &expected));

        // Wrong data should fail verification
        let tampered = b"Wrong data";
        assert!(!ContentVerifier::verify_multi(tampered, &expected));
    }
}