//! File chunking configuration following the ZephyrFS architecture
//!
//! Safety: Chunk sizes are validated and bounded to prevent memory exhaustion
//! Transparency: All chunking operations are logged with metadata

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::io::{Read, Seek, SeekFrom};
use tracing::{debug, info, warn};

const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024; // 1MB
const MIN_CHUNK_SIZE: usize = 64 * 1024; // 64KB minimum
const MAX_CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB maximum

/// Metadata for a file chunk
///
/// Privacy: Contains only structural information, no content
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ChunkInfo {
    /// Unique identifier for the chunk
    pub chunk_id: String,

    /// SHA-256 hash of chunk content for integrity verification
    pub hash: String,

    /// Size of the chunk in bytes
    pub size: u64,

    /// Position of this chunk within the original file
    pub index: u32,

    /// Offset within the original file
    pub offset: u64,
}

/// Metadata for a chunked file
///
/// Transparency: Complete file reconstruction information available
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMetadata {
    /// Original file identifier
    pub file_id: String,

    /// Original filename (for user convenience)
    pub filename: String,

    /// Total size of original file
    pub total_size: u64,

    /// SHA-256 hash of complete file for integrity verification
    pub file_hash: String,

    /// Ordered list of chunks
    pub chunks: Vec<ChunkInfo>,

    /// Chunk size used for this file
    pub chunk_size: usize,

    /// MIME type if detected
    pub mime_type: Option<String>,

    /// Creation timestamp (seconds since the Unix epoch)
    pub created_at: u64,
}

/// File chunking engine with security and integrity focus
///
/// Safety: All operations include bounds checking and validation
/// Privacy: Original file content is never stored unencrypted
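///
/// Example: a minimal round-trip sketch (fenced as `ignore` because the crate
/// path for this module is project-specific):
///
/// ```ignore
/// let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
/// let data = vec![7u8; 100 * 1024];
///
/// let metadata = chunker.chunk_bytes(&data, "file-1".to_string(), "blob.bin".to_string()).unwrap();
///
/// // Slice the original buffer by the recorded offsets/sizes, simulating
/// // retrieval of each chunk from storage.
/// let chunks: Vec<Vec<u8>> = metadata.chunks.iter()
///     .map(|c| data[c.offset as usize..(c.offset + c.size) as usize].to_vec())
///     .collect();
///
/// assert_eq!(chunker.reconstruct_file(&metadata, chunks).unwrap(), data);
/// ```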
pub struct FileChunker {
    chunk_size: usize,
}

impl FileChunker {
    /// Create a new FileChunker with the specified chunk size
    ///
    /// Safety: Validates chunk size is within safe bounds
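    ///
    /// Example: a minimal sketch of the accepted bounds (fenced as `ignore`
    /// because the crate path for this module is project-specific):
    ///
    /// ```ignore
    /// // 64 KiB through 16 MiB is accepted; anything outside errors.
    /// assert!(FileChunker::new(Some(8 * 1024)).is_err());
    /// let chunker = FileChunker::new(None).unwrap(); // 1 MiB default
    /// ```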
    pub fn new(chunk_size: Option<usize>) -> Result<Self> {
        let chunk_size = chunk_size.unwrap_or(DEFAULT_CHUNK_SIZE);

        if chunk_size < MIN_CHUNK_SIZE || chunk_size > MAX_CHUNK_SIZE {
            anyhow::bail!(
                "Chunk size {} is outside safe bounds [{}, {}]",
                chunk_size, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE
            );
        }

        info!("Initialized FileChunker with chunk size: {} bytes", chunk_size);
        Ok(Self { chunk_size })
    }

    /// Create a default FileChunker with 1MB chunks
    pub fn default() -> Self {
        Self::new(None).expect("Default chunk size should always be valid")
    }

    /// Chunk a file from a reader
    ///
    /// Safety: Uses bounded reads to prevent memory exhaustion
    /// Transparency: All chunking steps are logged
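    ///
    /// Example: a minimal sketch using an in-memory reader (fenced as `ignore`
    /// because the crate path for this module is project-specific):
    ///
    /// ```ignore
    /// use std::io::Cursor;
    ///
    /// let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
    /// let metadata = chunker.chunk_file(
    ///     Cursor::new(vec![0u8; 150 * 1024]),
    ///     "file-1".to_string(),
    ///     "blob.bin".to_string(),
    /// ).unwrap();
    /// assert_eq!(metadata.chunks.len(), 3); // 64 KiB + 64 KiB + 22 KiB
    /// ```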
    pub fn chunk_file<R: Read + Seek>(
        &self,
        mut reader: R,
        file_id: String,
        filename: String,
    ) -> Result<FileMetadata> {
        info!("Chunking file: {} (ID: {})", filename, file_id);

        // Determine the total file size, then rewind to the start
        let total_size = reader.seek(SeekFrom::End(0))
            .context("Failed to determine file size")?;
        reader.seek(SeekFrom::Start(0))
            .context("Failed to seek to file start")?;

        if total_size == 0 {
            warn!("Attempting to chunk empty file: {}", filename);
            return Ok(FileMetadata {
                file_id,
                filename,
                total_size: 0,
                file_hash: self.calculate_empty_file_hash(),
                chunks: vec![],
                chunk_size: self.chunk_size,
                mime_type: None,
                created_at: std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)?
                    .as_secs(),
            });
        }

        let mut chunks = Vec::new();
        let mut file_hasher = Sha256::new();
        let mut buffer = vec![0u8; self.chunk_size];
        let mut total_read = 0u64;
        let mut chunk_index = 0u32;

        debug!("Starting to read file in {} byte chunks", self.chunk_size);

        loop {
            // Fill the buffer as fully as possible before cutting a chunk:
            // a single `read` call may legally return fewer bytes than
            // requested, which would otherwise produce irregular chunk sizes.
            let mut bytes_read = 0;
            while bytes_read < buffer.len() {
                let n = reader.read(&mut buffer[bytes_read..])
                    .context("Failed to read from file")?;
                if n == 0 {
                    break;
                }
                bytes_read += n;
            }

            if bytes_read == 0 {
                break; // End of file
            }

            let chunk_data = &buffer[..bytes_read];

            // Update file hash with chunk data
            file_hasher.update(chunk_data);

            // Calculate chunk hash
            let chunk_hash = self.calculate_chunk_hash(chunk_data);

            // Generate chunk ID (content-addressable)
            let chunk_id = format!("chunk_{}", &chunk_hash[..16]);

            debug!(
                "Created chunk {} (index: {}, size: {} bytes, offset: {})",
                chunk_id, chunk_index, bytes_read, total_read
            );

            chunks.push(ChunkInfo {
                chunk_id,
                hash: chunk_hash,
                size: bytes_read as u64,
                index: chunk_index,
                offset: total_read,
            });

            total_read += bytes_read as u64;
            chunk_index += 1;
        }

        // Calculate final file hash
        let file_hash = hex::encode(file_hasher.finalize());

        let metadata = FileMetadata {
            file_id,
            filename: filename.clone(),
            total_size,
            file_hash,
            chunks,
            chunk_size: self.chunk_size,
            mime_type: self.detect_mime_type(&filename),
            created_at: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)?
                .as_secs(),
        };

        info!(
            "Successfully chunked file {} into {} chunks (total: {} bytes)",
            filename, metadata.chunks.len(), total_size
        );

        Ok(metadata)
    }

    /// Chunk data from a byte slice
    ///
    /// Safety: Memory-bounded operation suitable for smaller files
    pub fn chunk_bytes(
        &self,
        data: &[u8],
        file_id: String,
        filename: String,
    ) -> Result<FileMetadata> {
        info!("Chunking {} bytes of data for file: {}", data.len(), filename);

        if data.is_empty() {
            return Ok(FileMetadata {
                file_id,
                filename,
                total_size: 0,
                file_hash: self.calculate_empty_file_hash(),
                chunks: vec![],
                chunk_size: self.chunk_size,
                mime_type: None,
                created_at: std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)?
                    .as_secs(),
            });
        }

        let mut chunks = Vec::new();
        let mut file_hasher = Sha256::new();
        file_hasher.update(data);

        for (chunk_index, chunk_data) in data.chunks(self.chunk_size).enumerate() {
            let chunk_hash = self.calculate_chunk_hash(chunk_data);
            let chunk_id = format!("chunk_{}", &chunk_hash[..16]);
            let offset = (chunk_index * self.chunk_size) as u64;

            debug!(
                "Created chunk {} (index: {}, size: {} bytes)",
                chunk_id, chunk_index, chunk_data.len()
            );

            chunks.push(ChunkInfo {
                chunk_id,
                hash: chunk_hash,
                size: chunk_data.len() as u64,
                index: chunk_index as u32,
                offset,
            });
        }

        let file_hash = hex::encode(file_hasher.finalize());

        let metadata = FileMetadata {
            file_id,
            filename: filename.clone(),
            total_size: data.len() as u64,
            file_hash,
            chunks,
            chunk_size: self.chunk_size,
            mime_type: self.detect_mime_type(&filename),
            created_at: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)?
                .as_secs(),
        };

        info!(
            "Successfully chunked {} bytes into {} chunks",
            data.len(), metadata.chunks.len()
        );

        Ok(metadata)
    }

    /// Reconstruct file data from chunks
    ///
    /// Safety: Validates chunk order and integrity before reconstruction
    /// Transparency: Reconstruction process is fully logged
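    ///
    /// Example: a sketch of tamper detection (fenced as `ignore` because the
    /// crate path is project-specific and `load_chunks` is a hypothetical
    /// retrieval helper, not part of this module):
    ///
    /// ```ignore
    /// let mut chunks: Vec<Vec<u8>> = load_chunks(&metadata)?;
    /// chunks[0][0] ^= 0xFF; // flip one byte
    /// // Hash verification rejects the corrupted chunk.
    /// assert!(chunker.reconstruct_file(&metadata, chunks).is_err());
    /// ```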
    pub fn reconstruct_file(
        &self,
        metadata: &FileMetadata,
        chunk_data: Vec<Vec<u8>>,
    ) -> Result<Vec<u8>> {
        info!("Reconstructing file: {} ({} chunks)", metadata.filename, metadata.chunks.len());

        if chunk_data.len() != metadata.chunks.len() {
            anyhow::bail!(
                "Chunk data length {} doesn't match expected chunk count {}",
                chunk_data.len(), metadata.chunks.len()
            );
        }

        // Verify all chunks are present, in order, and intact
        for (i, (chunk_info, data)) in metadata.chunks.iter().zip(chunk_data.iter()).enumerate() {
            if chunk_info.index as usize != i {
                anyhow::bail!("Chunk {} is out of order (expected index {})", chunk_info.chunk_id, i);
            }

            if data.len() as u64 != chunk_info.size {
                anyhow::bail!(
                    "Chunk {} size mismatch: expected {}, got {}",
                    chunk_info.chunk_id, chunk_info.size, data.len()
                );
            }

            // Verify chunk hash
            let calculated_hash = self.calculate_chunk_hash(data);
            if calculated_hash != chunk_info.hash {
                anyhow::bail!("Chunk {} hash verification failed", chunk_info.chunk_id);
            }
        }

        // Reassemble the file in chunk order
        let mut reconstructed = Vec::with_capacity(metadata.total_size as usize);
        for data in chunk_data {
            reconstructed.extend_from_slice(&data);
        }

        // Verify the reconstructed file hash against the metadata
        let mut file_hasher = Sha256::new();
        file_hasher.update(&reconstructed);
        let calculated_hash = hex::encode(file_hasher.finalize());

        if calculated_hash != metadata.file_hash {
            anyhow::bail!("Reconstructed file hash verification failed");
        }

        info!("Successfully reconstructed file: {} ({} bytes)", metadata.filename, reconstructed.len());
        Ok(reconstructed)
    }

    /// Calculate the SHA-256 hash of chunk data
    fn calculate_chunk_hash(&self, data: &[u8]) -> String {
        let mut hasher = Sha256::new();
        hasher.update(data);
        hex::encode(hasher.finalize())
    }

    /// Calculate the hash of an empty file: the SHA-256 of zero bytes,
    /// consistent across all empty files
    fn calculate_empty_file_hash(&self) -> String {
        let hasher = Sha256::new();
        hex::encode(hasher.finalize())
    }

    /// Simple MIME type detection based on file extension
    ///
    /// Privacy: Only uses filename extension, no content inspection
    fn detect_mime_type(&self, filename: &str) -> Option<String> {
        let extension = std::path::Path::new(filename)
            .extension()?
            .to_str()?
            .to_lowercase();

        match extension.as_str() {
            "txt" | "md" => Some("text/plain".to_string()),
            "html" | "htm" => Some("text/html".to_string()),
            "json" => Some("application/json".to_string()),
            "pdf" => Some("application/pdf".to_string()),
            "jpg" | "jpeg" => Some("image/jpeg".to_string()),
            "png" => Some("image/png".to_string()),
            "gif" => Some("image/gif".to_string()),
            "zip" => Some("application/zip".to_string()),
            "tar" => Some("application/x-tar".to_string()),
            "gz" => Some("application/gzip".to_string()),
            _ => None,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_file_chunker_creation() {
        let chunker = FileChunker::default();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);

        let custom_chunker = FileChunker::new(Some(512 * 1024)).unwrap();
        assert_eq!(custom_chunker.chunk_size, 512 * 1024);

        // Test invalid chunk sizes
        assert!(FileChunker::new(Some(1024)).is_err()); // Too small
        assert!(FileChunker::new(Some(32 * 1024 * 1024)).is_err()); // Too large
    }

    #[test]
    fn test_chunk_empty_data() {
        let chunker = FileChunker::default();
        let metadata = chunker.chunk_bytes(
            &[],
            "empty-test".to_string(),
            "empty.txt".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, 0);
        assert!(metadata.chunks.is_empty());
        assert!(!metadata.file_hash.is_empty());
    }

    #[test]
    fn test_chunk_small_data() {
        let chunker = FileChunker::new(Some(128 * 1024)).unwrap(); // 128KB chunks
        let test_data = b"Hello, ZephyrFS! This is a test file for chunking.";

        let metadata = chunker.chunk_bytes(
            test_data,
            "small-test".to_string(),
            "test.txt".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, test_data.len() as u64);
        assert_eq!(metadata.chunks.len(), 1); // Should fit in one chunk
        assert_eq!(metadata.chunks[0].size, test_data.len() as u64);
        assert_eq!(metadata.chunks[0].index, 0);
        assert_eq!(metadata.chunks[0].offset, 0);
        assert_eq!(metadata.mime_type, Some("text/plain".to_string()));
    }

    #[test]
    fn test_chunk_large_data() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap(); // 64KB chunks for testing
        let test_data = vec![42u8; 200 * 1024]; // 200KB of data

        let metadata = chunker.chunk_bytes(
            &test_data,
            "large-test".to_string(),
            "large.bin".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, 200 * 1024);
        assert_eq!(metadata.chunks.len(), 4); // Should split into 4 chunks

        // Verify chunk sizes
        assert_eq!(metadata.chunks[0].size, 64 * 1024);
        assert_eq!(metadata.chunks[1].size, 64 * 1024);
        assert_eq!(metadata.chunks[2].size, 64 * 1024);
        assert_eq!(metadata.chunks[3].size, 8 * 1024); // Remainder

        // Verify offsets
        assert_eq!(metadata.chunks[0].offset, 0);
        assert_eq!(metadata.chunks[1].offset, 64 * 1024);
        assert_eq!(metadata.chunks[2].offset, 128 * 1024);
        assert_eq!(metadata.chunks[3].offset, 192 * 1024);
    }

    #[test]
    fn test_file_reconstruction() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
        let original_data = b"The quick brown fox jumps over the lazy dog. ".repeat(50);

        // Chunk the data
        let metadata = chunker.chunk_bytes(
            &original_data,
            "reconstruction-test".to_string(),
            "test.txt".to_string(),
        ).unwrap();

        // Extract chunk data (simulating retrieval from storage)
        let mut chunk_data = Vec::new();
        let mut offset = 0;
        for chunk_info in &metadata.chunks {
            let end = offset + chunk_info.size as usize;
            chunk_data.push(original_data[offset..end].to_vec());
            offset = end;
        }

        // Reconstruct the file
        let reconstructed = chunker.reconstruct_file(&metadata, chunk_data).unwrap();

        assert_eq!(reconstructed, original_data);
    }

    #[test]
    fn test_chunk_reader() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
        let test_data = b"This is test data for the reader-based chunking functionality.";
        let mut cursor = Cursor::new(test_data);

        let metadata = chunker.chunk_file(
            &mut cursor,
            "reader-test".to_string(),
            "reader.txt".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, test_data.len() as u64);
        assert_eq!(metadata.chunks.len(), 1); // Small data fits in one chunk
        assert!(!metadata.file_hash.is_empty());
    }
|
| 487 |
#[test] |
| 488 |
fn test_hash_consistency() { |
| 489 |
let chunker = FileChunker::default(); |
| 490 |
let test_data = b"Consistent hashing test data"; |
| 491 |
|
| 492 |
// Chunk the same data twice |
| 493 |
let metadata1 = chunker.chunk_bytes( |
| 494 |
test_data, |
| 495 |
"hash-test-1".to_string(), |
| 496 |
"hash.txt".to_string(), |
| 497 |
).unwrap(); |
| 498 |
|
| 499 |
let metadata2 = chunker.chunk_bytes( |
| 500 |
test_data, |
| 501 |
"hash-test-2".to_string(), |
| 502 |
"hash.txt".to_string(), |
| 503 |
).unwrap(); |
| 504 |
|
| 505 |
// File hashes should be identical |
| 506 |
assert_eq!(metadata1.file_hash, metadata2.file_hash); |
| 507 |
assert_eq!(metadata1.chunks[0].hash, metadata2.chunks[0].hash); |
| 508 |
} |
| 509 |
|
| 510 |
#[test] |
| 511 |
fn test_mime_type_detection() { |
| 512 |
let chunker = FileChunker::default(); |
| 513 |
|
| 514 |
assert_eq!(chunker.detect_mime_type("test.txt"), Some("text/plain".to_string())); |
| 515 |
assert_eq!(chunker.detect_mime_type("doc.pdf"), Some("application/pdf".to_string())); |
| 516 |
assert_eq!(chunker.detect_mime_type("image.png"), Some("image/png".to_string())); |
| 517 |
assert_eq!(chunker.detect_mime_type("unknown.xyz"), None); |
| 518 |
} |
| 519 |
|
| 520 |
#[test] |
| 521 |
fn test_chunk_integrity_verification() { |
| 522 |
let chunker = FileChunker::new(Some(64 * 1024)).unwrap(); |
| 523 |
let test_data = vec![1u8; 2048]; // 2KB data |
| 524 |
|
| 525 |
let metadata = chunker.chunk_bytes( |
| 526 |
&test_data, |
| 527 |
"integrity-test".to_string(), |
| 528 |
"integrity.bin".to_string(), |
| 529 |
).unwrap(); |
| 530 |
|
| 531 |
// With 64KB chunks, 2KB data will be in a single chunk |
| 532 |
let chunk_data = vec![test_data.clone()]; |
| 533 |
|
| 534 |
// Should reconstruct successfully |
| 535 |
let reconstructed = chunker.reconstruct_file(&metadata, chunk_data).unwrap(); |
| 536 |
assert_eq!(reconstructed, test_data); |
| 537 |
|
| 538 |
// Test with corrupted chunk |
| 539 |
let corrupted_chunk_data = vec![ |
| 540 |
vec![0u8; 2048], // Corrupted chunk (same size but different data) |
| 541 |
]; |
| 542 |
|
| 543 |
// Should fail reconstruction |
| 544 |
assert!(chunker.reconstruct_file(&metadata, corrupted_chunk_data).is_err()); |
| 545 |
} |
| 546 |
} |