use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::io::{Read, Seek, SeekFrom};
use tracing::{debug, info, warn};

/// File chunking configuration following ZephyrFS architecture
///
/// Safety: Chunk sizes are validated and bounded to prevent memory exhaustion
/// Transparency: All chunking operations are logged with metadata
const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024; // 1MB
const MIN_CHUNK_SIZE: usize = 64 * 1024; // 64KB minimum
const MAX_CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB maximum
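
// Worked example: with the default 1MB chunk size, a 2.5MB file splits into
// three chunks (1MB, 1MB, 0.5MB). In general the chunk count is
// ceil(total_size / chunk_size), i.e. (total_size + chunk_size - 1) / chunk_size
// in integer arithmetic.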

/// Metadata for a file chunk
///
/// Privacy: Contains only structural information, no content
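///
/// Chunk IDs are content-addressable: `chunk_` followed by the first 16 hex
/// characters of the chunk's SHA-256 hash.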
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ChunkInfo {
    /// Unique identifier for the chunk
    pub chunk_id: String,

    /// SHA-256 hash of chunk content for integrity verification
    pub hash: String,

    /// Size of the chunk in bytes
    pub size: u64,

    /// Position of this chunk within the original file
    pub index: u32,

    /// Offset within the original file
    pub offset: u64,
}

/// Metadata for a chunked file
///
/// Transparency: Complete file reconstruction information available
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMetadata {
    /// Original file identifier
    pub file_id: String,

    /// Original filename (for user convenience)
    pub filename: String,

    /// Total size of the original file in bytes
    pub total_size: u64,

    /// SHA-256 hash of the complete file for integrity verification
    pub file_hash: String,

    /// Ordered list of chunks
    pub chunks: Vec<ChunkInfo>,

    /// Chunk size used for this file
    pub chunk_size: usize,

    /// MIME type, if detected
    pub mime_type: Option<String>,

    /// Creation timestamp (Unix seconds)
    pub created_at: u64,
}

/// File chunking engine with security and integrity focus
///
/// Safety: All operations include bounds checking and validation
/// Privacy: Original file content is never stored unencrypted
pub struct FileChunker {
    chunk_size: usize,
}

impl FileChunker {
    /// Create a new FileChunker with the specified chunk size
    ///
    /// Safety: Validates that the chunk size is within safe bounds
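    ///
    /// # Example
    ///
    /// A minimal sketch (fenced as `ignore` because the import path depends on
    /// where this module is mounted in the crate):
    ///
    /// ```ignore
    /// let chunker = FileChunker::new(Some(4 * 1024 * 1024))?; // 4MB chunks
    /// assert!(FileChunker::new(Some(1024)).is_err()); // below the 64KB minimum
    /// ```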
    pub fn new(chunk_size: Option<usize>) -> Result<Self> {
        let chunk_size = chunk_size.unwrap_or(DEFAULT_CHUNK_SIZE);

        if !(MIN_CHUNK_SIZE..=MAX_CHUNK_SIZE).contains(&chunk_size) {
            anyhow::bail!(
                "Chunk size {} is outside safe bounds [{}, {}]",
                chunk_size,
                MIN_CHUNK_SIZE,
                MAX_CHUNK_SIZE
            );
        }

        info!("Initialized FileChunker with chunk size: {} bytes", chunk_size);
        Ok(Self { chunk_size })
    }

    /// Create a default FileChunker with 1MB chunks
    pub fn default() -> Self {
        Self::new(None).expect("Default chunk size should always be valid")
    }

    /// Chunk a file from a reader
    ///
    /// Safety: Uses bounded reads to prevent memory exhaustion
    /// Transparency: All chunking steps are logged
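    ///
    /// # Example
    ///
    /// A minimal sketch using an in-memory reader (fenced as `ignore` because
    /// the import path depends on where this module is mounted):
    ///
    /// ```ignore
    /// use std::io::Cursor;
    ///
    /// let chunker = FileChunker::default();
    /// let metadata = chunker.chunk_file(
    ///     Cursor::new(b"hello".to_vec()),
    ///     "file-1".to_string(),
    ///     "hello.txt".to_string(),
    /// )?;
    /// assert_eq!(metadata.total_size, 5);
    /// assert_eq!(metadata.chunks.len(), 1);
    /// ```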
    pub fn chunk_file<R: Read + Seek>(
        &self,
        mut reader: R,
        file_id: String,
        filename: String,
    ) -> Result<FileMetadata> {
        info!("Chunking file: {} (ID: {})", filename, file_id);

        // Determine the total file size, then rewind to the start
        let total_size = reader
            .seek(SeekFrom::End(0))
            .context("Failed to determine file size")?;
        reader
            .seek(SeekFrom::Start(0))
            .context("Failed to seek to file start")?;

        if total_size == 0 {
            warn!("Attempting to chunk empty file: {}", filename);
            return Ok(FileMetadata {
                file_id,
                filename,
                total_size: 0,
                file_hash: self.calculate_empty_file_hash(),
                chunks: vec![],
                chunk_size: self.chunk_size,
                mime_type: None,
                created_at: std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)?
                    .as_secs(),
            });
        }

        let mut chunks = Vec::new();
        let mut file_hasher = Sha256::new();
        let mut buffer = vec![0u8; self.chunk_size];
        let mut total_read = 0u64;
        let mut chunk_index = 0u32;

        debug!("Starting to read file in {} byte chunks", self.chunk_size);

        loop {
            // Fill the buffer completely (or up to EOF). A single `read` call
            // may return fewer bytes than requested, which would otherwise
            // produce irregular chunk boundaries that disagree with
            // `chunk_bytes` for the same content.
            let mut bytes_read = 0;
            while bytes_read < self.chunk_size {
                match reader.read(&mut buffer[bytes_read..]) {
                    Ok(0) => break, // End of file
                    Ok(n) => bytes_read += n,
                    Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e).context("Failed to read from file"),
                }
            }

            if bytes_read == 0 {
                break; // End of file
            }

            let chunk_data = &buffer[..bytes_read];

            // Update the whole-file hash with this chunk's data
            file_hasher.update(chunk_data);

            // Calculate the chunk hash
            let chunk_hash = self.calculate_chunk_hash(chunk_data);

            // Generate a content-addressable chunk ID
            let chunk_id = format!("chunk_{}", &chunk_hash[..16]);

            debug!(
                "Created chunk {} (index: {}, size: {} bytes, offset: {})",
                chunk_id, chunk_index, bytes_read, total_read
            );

            chunks.push(ChunkInfo {
                chunk_id,
                hash: chunk_hash,
                size: bytes_read as u64,
                index: chunk_index,
                offset: total_read,
            });
            total_read += bytes_read as u64;
            chunk_index += 1;
        }

        // Calculate the final whole-file hash
        let file_hash = hex::encode(file_hasher.finalize());

        let metadata = FileMetadata {
            file_id,
            filename: filename.clone(),
            total_size,
            file_hash,
            chunks,
            chunk_size: self.chunk_size,
            mime_type: self.detect_mime_type(&filename),
            created_at: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)?
                .as_secs(),
        };

        info!(
            "Successfully chunked file {} into {} chunks (total: {} bytes)",
            filename,
            metadata.chunks.len(),
            total_size
        );

        Ok(metadata)
    }

    /// Chunk data from a byte slice
    ///
    /// Safety: Memory-bounded operation suitable for smaller files
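    ///
    /// # Example
    ///
    /// A minimal sketch (fenced as `ignore` because the import path depends on
    /// where this module is mounted):
    ///
    /// ```ignore
    /// let chunker = FileChunker::default();
    /// let metadata = chunker.chunk_bytes(
    ///     b"hello, world",
    ///     "file-2".to_string(),
    ///     "hello.txt".to_string(),
    /// )?;
    /// assert_eq!(metadata.chunks.len(), 1); // 12 bytes fit in one 1MB chunk
    /// ```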
    pub fn chunk_bytes(
        &self,
        data: &[u8],
        file_id: String,
        filename: String,
    ) -> Result<FileMetadata> {
        info!("Chunking {} bytes of data for file: {}", data.len(), filename);

        if data.is_empty() {
            return Ok(FileMetadata {
                file_id,
                filename,
                total_size: 0,
                file_hash: self.calculate_empty_file_hash(),
                chunks: vec![],
                chunk_size: self.chunk_size,
                mime_type: None,
                created_at: std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)?
                    .as_secs(),
            });
        }

        let mut chunks = Vec::new();
        let mut file_hasher = Sha256::new();
        file_hasher.update(data);

        for (chunk_index, chunk_data) in data.chunks(self.chunk_size).enumerate() {
            let chunk_hash = self.calculate_chunk_hash(chunk_data);
            let chunk_id = format!("chunk_{}", &chunk_hash[..16]);
            let offset = (chunk_index * self.chunk_size) as u64;

            debug!(
                "Created chunk {} (index: {}, size: {} bytes)",
                chunk_id,
                chunk_index,
                chunk_data.len()
            );

            chunks.push(ChunkInfo {
                chunk_id,
                hash: chunk_hash,
                size: chunk_data.len() as u64,
                index: chunk_index as u32,
                offset,
            });
        }

        let file_hash = hex::encode(file_hasher.finalize());

        let metadata = FileMetadata {
            file_id,
            filename: filename.clone(),
            total_size: data.len() as u64,
            file_hash,
            chunks,
            chunk_size: self.chunk_size,
            mime_type: self.detect_mime_type(&filename),
            created_at: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)?
                .as_secs(),
        };

        info!(
            "Successfully chunked {} bytes into {} chunks",
            data.len(),
            metadata.chunks.len()
        );

        Ok(metadata)
    }

    /// Reconstruct file data from chunks
    ///
    /// Safety: Validates chunk order and integrity before reconstruction
    /// Transparency: Reconstruction process is fully logged
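    ///
    /// # Example
    ///
    /// A minimal round-trip sketch (fenced as `ignore` because the import path
    /// depends on where this module is mounted):
    ///
    /// ```ignore
    /// let chunker = FileChunker::default();
    /// let data = vec![0u8; 4096];
    /// let metadata = chunker.chunk_bytes(&data, "id".to_string(), "f.bin".to_string())?;
    /// // One chunk, since 4KB < the 1MB default chunk size
    /// let restored = chunker.reconstruct_file(&metadata, vec![data.clone()])?;
    /// assert_eq!(restored, data);
    /// ```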
    pub fn reconstruct_file(
        &self,
        metadata: &FileMetadata,
        chunk_data: Vec<Vec<u8>>,
    ) -> Result<Vec<u8>> {
        info!(
            "Reconstructing file: {} ({} chunks)",
            metadata.filename,
            metadata.chunks.len()
        );

        if chunk_data.len() != metadata.chunks.len() {
            anyhow::bail!(
                "Chunk data length {} doesn't match metadata chunk count {}",
                chunk_data.len(),
                metadata.chunks.len()
            );
        }

        // Verify all chunks are present, in order, and intact
        for (i, (chunk_info, data)) in metadata.chunks.iter().zip(chunk_data.iter()).enumerate() {
            if chunk_info.index as usize != i {
                anyhow::bail!(
                    "Chunk {} is out of order (expected index {})",
                    chunk_info.chunk_id,
                    i
                );
            }

            if data.len() as u64 != chunk_info.size {
                anyhow::bail!(
                    "Chunk {} size mismatch: expected {}, got {}",
                    chunk_info.chunk_id,
                    chunk_info.size,
                    data.len()
                );
            }

            // Verify the chunk hash
            let calculated_hash = self.calculate_chunk_hash(data);
            if calculated_hash != chunk_info.hash {
                anyhow::bail!("Chunk {} hash verification failed", chunk_info.chunk_id);
            }
        }

        // Reconstruct the file by concatenating chunks in order
        let mut reconstructed = Vec::with_capacity(metadata.total_size as usize);
        for data in chunk_data {
            reconstructed.extend_from_slice(&data);
        }

        // Verify the reconstructed file hash
        let mut file_hasher = Sha256::new();
        file_hasher.update(&reconstructed);
        let calculated_hash = hex::encode(file_hasher.finalize());

        if calculated_hash != metadata.file_hash {
            anyhow::bail!("Reconstructed file hash verification failed");
        }

        info!(
            "Successfully reconstructed file: {} ({} bytes)",
            metadata.filename,
            reconstructed.len()
        );
        Ok(reconstructed)
    }

    /// Calculate SHA-256 hash of chunk data
    fn calculate_chunk_hash(&self, data: &[u8]) -> String {
        let mut hasher = Sha256::new();
        hasher.update(data);
        hex::encode(hasher.finalize())
    }

    /// Calculate the hash for an empty file (consistent across all empty files)
    fn calculate_empty_file_hash(&self) -> String {
        // SHA-256 of zero bytes: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
        let hasher = Sha256::new();
        hex::encode(hasher.finalize())
    }

    /// Simple MIME type detection based on file extension
    ///
    /// Privacy: Only uses filename extension, no content inspection
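    ///
    /// Matching is case-insensitive, so e.g. "photo.JPG" maps to "image/jpeg";
    /// unrecognized extensions yield `None`.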
    fn detect_mime_type(&self, filename: &str) -> Option<String> {
        let extension = std::path::Path::new(filename)
            .extension()?
            .to_str()?
            .to_lowercase();

        match extension.as_str() {
            "txt" | "md" => Some("text/plain".to_string()),
            "html" | "htm" => Some("text/html".to_string()),
            "json" => Some("application/json".to_string()),
            "pdf" => Some("application/pdf".to_string()),
            "jpg" | "jpeg" => Some("image/jpeg".to_string()),
            "png" => Some("image/png".to_string()),
            "gif" => Some("image/gif".to_string()),
            "zip" => Some("application/zip".to_string()),
            "tar" => Some("application/x-tar".to_string()),
            "gz" => Some("application/gzip".to_string()),
            _ => None,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_file_chunker_creation() {
        let chunker = FileChunker::default();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);

        let custom_chunker = FileChunker::new(Some(512 * 1024)).unwrap();
        assert_eq!(custom_chunker.chunk_size, 512 * 1024);

        // Test invalid chunk sizes
        assert!(FileChunker::new(Some(1024)).is_err()); // Too small
        assert!(FileChunker::new(Some(32 * 1024 * 1024)).is_err()); // Too large
    }

    #[test]
    fn test_chunk_empty_data() {
        let chunker = FileChunker::default();
        let metadata = chunker.chunk_bytes(
            &[],
            "empty-test".to_string(),
            "empty.txt".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, 0);
        assert!(metadata.chunks.is_empty());
        assert!(!metadata.file_hash.is_empty());
    }

    #[test]
    fn test_chunk_small_data() {
        let chunker = FileChunker::new(Some(128 * 1024)).unwrap(); // 128KB chunks
        let test_data = b"Hello, ZephyrFS! This is a test file for chunking.";

        let metadata = chunker.chunk_bytes(
            test_data,
            "small-test".to_string(),
            "test.txt".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, test_data.len() as u64);
        assert_eq!(metadata.chunks.len(), 1); // Should fit in one chunk
        assert_eq!(metadata.chunks[0].size, test_data.len() as u64);
        assert_eq!(metadata.chunks[0].index, 0);
        assert_eq!(metadata.chunks[0].offset, 0);
        assert_eq!(metadata.mime_type, Some("text/plain".to_string()));
    }

    #[test]
    fn test_chunk_large_data() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap(); // 64KB chunks for testing
        let test_data = vec![42u8; 200 * 1024]; // 200KB of data

        let metadata = chunker.chunk_bytes(
            &test_data,
            "large-test".to_string(),
            "large.bin".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, 200 * 1024);
        assert_eq!(metadata.chunks.len(), 4); // Should split into 4 chunks

        // Verify chunk sizes
        assert_eq!(metadata.chunks[0].size, 64 * 1024);
        assert_eq!(metadata.chunks[1].size, 64 * 1024);
        assert_eq!(metadata.chunks[2].size, 64 * 1024);
        assert_eq!(metadata.chunks[3].size, 8 * 1024); // Remainder

        // Verify offsets
        assert_eq!(metadata.chunks[0].offset, 0);
        assert_eq!(metadata.chunks[1].offset, 64 * 1024);
        assert_eq!(metadata.chunks[2].offset, 128 * 1024);
        assert_eq!(metadata.chunks[3].offset, 192 * 1024);
    }

    #[test]
    fn test_file_reconstruction() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
        let original_data = b"The quick brown fox jumps over the lazy dog. ".repeat(50);

        // Chunk the data
        let metadata = chunker.chunk_bytes(
            &original_data,
            "reconstruction-test".to_string(),
            "test.txt".to_string(),
        ).unwrap();

        // Extract chunk data (simulating retrieval from storage)
        let mut chunk_data = Vec::new();
        let mut offset = 0;
        for chunk_info in &metadata.chunks {
            let end = offset + chunk_info.size as usize;
            chunk_data.push(original_data[offset..end].to_vec());
            offset = end;
        }

        // Reconstruct the file
        let reconstructed = chunker.reconstruct_file(&metadata, chunk_data).unwrap();

        assert_eq!(reconstructed, original_data);
    }

    #[test]
    fn test_chunk_reader() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
        let test_data = b"This is test data for the reader-based chunking functionality.";
        let mut cursor = Cursor::new(test_data);

        let metadata = chunker.chunk_file(
            &mut cursor,
            "reader-test".to_string(),
            "reader.txt".to_string(),
        ).unwrap();

        assert_eq!(metadata.total_size, test_data.len() as u64);
        assert_eq!(metadata.chunks.len(), 1); // Small data fits in one chunk
        assert!(!metadata.file_hash.is_empty());
    }

    #[test]
    fn test_hash_consistency() {
        let chunker = FileChunker::default();
        let test_data = b"Consistent hashing test data";

        // Chunk the same data twice under different file IDs
        let metadata1 = chunker.chunk_bytes(
            test_data,
            "hash-test-1".to_string(),
            "hash.txt".to_string(),
        ).unwrap();

        let metadata2 = chunker.chunk_bytes(
            test_data,
            "hash-test-2".to_string(),
            "hash.txt".to_string(),
        ).unwrap();

        // File and chunk hashes should be identical
        assert_eq!(metadata1.file_hash, metadata2.file_hash);
        assert_eq!(metadata1.chunks[0].hash, metadata2.chunks[0].hash);
    }

    #[test]
    fn test_mime_type_detection() {
        let chunker = FileChunker::default();

        assert_eq!(chunker.detect_mime_type("test.txt"), Some("text/plain".to_string()));
        assert_eq!(chunker.detect_mime_type("doc.pdf"), Some("application/pdf".to_string()));
        assert_eq!(chunker.detect_mime_type("image.png"), Some("image/png".to_string()));
        assert_eq!(chunker.detect_mime_type("unknown.xyz"), None);
    }

    #[test]
    fn test_chunk_integrity_verification() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
        let test_data = vec![1u8; 2048]; // 2KB of data

        let metadata = chunker.chunk_bytes(
            &test_data,
            "integrity-test".to_string(),
            "integrity.bin".to_string(),
        ).unwrap();

        // With 64KB chunks, 2KB of data lands in a single chunk
        let chunk_data = vec![test_data.clone()];

        // Should reconstruct successfully
        let reconstructed = chunker.reconstruct_file(&metadata, chunk_data).unwrap();
        assert_eq!(reconstructed, test_data);

        // Test with a corrupted chunk (same size but different data)
        let corrupted_chunk_data = vec![
            vec![0u8; 2048],
        ];

        // Reconstruction should fail hash verification
        assert!(chunker.reconstruct_file(&metadata, corrupted_chunk_data).is_err());
    }
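
    // Additional test (not part of the original suite): reconstruction must
    // reject a chunk-count mismatch before any per-chunk checks run.
    #[test]
    fn test_reconstruct_rejects_chunk_count_mismatch() {
        let chunker = FileChunker::new(Some(64 * 1024)).unwrap();
        let test_data = vec![7u8; 4096];

        let metadata = chunker.chunk_bytes(
            &test_data,
            "count-test".to_string(),
            "count.bin".to_string(),
        ).unwrap();

        // Supplying no chunks at all should fail fast with a length mismatch
        assert!(chunker.reconstruct_file(&metadata, vec![]).is_err());
    }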
}