Rust walkthroughs
How does zip::ZipArchive handle large files, and what are the memory implications of loading everything versus extracting on-the-fly? The zip crate's ZipArchive provides two primary approaches for handling compressed data: loading the entire archive into memory versus streaming extraction. The choice between these approaches has significant memory implications, especially when dealing with large archives or files within archives that exceed available memory.
use std::fs::File;
use zip::ZipArchive;
/// Open an archive and extract a single named entry to disk.
fn basic_usage() -> Result<(), Box<dyn std::error::Error>> {
    // Opening the archive parses only the central directory (metadata).
    let mut archive = ZipArchive::new(File::open("archive.zip")?)?;
    println!("Contains {} files", archive.len());

    // Look up the entry by name and stream its decompressed bytes to disk.
    let mut entry = archive.by_name("document.txt")?;
    let mut dest = File::create("document.txt")?;
    std::io::copy(&mut entry, &mut dest)?;
    Ok(())
}
The ZipArchive::new() call reads the archive's central directory, not the entire file content.
use std::fs::File;
use zip::ZipArchive;
/// Demonstrates what ZipArchive::new() actually loads into memory.
fn memory_model() -> Result<(), Box<dyn std::error::Error>> {
    // Parsing an archive reads:
    //   1. the end-of-central-directory record,
    //   2. the central directory (per-file metadata),
    //   3. and NOT the compressed file contents.
    let archive = ZipArchive::new(File::open("large_archive.zip")?)?;

    // Resident memory therefore scales with the number of entries, the
    // length of their names, and per-entry metadata — never with the
    // size of the compressed data itself.
    println!("{} files in archive", archive.len());
    println!("Archive size in memory: metadata only, not content");
    Ok(())
}
ZipArchive loads metadata but not file contents, making it efficient for large archives.
use std::fs::File;
use std::io::{self, Read, Write};
use zip::ZipArchive;
/// Extract every entry to disk, streaming each one so memory stays
/// bounded regardless of entry size.
fn streaming_extraction() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("large_archive.zip")?;
    let mut archive = ZipArchive::new(file)?;
    for i in 0..archive.len() {
        let mut entry = archive.by_index(i)?;
        // mangled_name() sanitizes the stored path (guards against "../"
        // traversal in hostile archives).
        let out_path = entry.mangled_name();
        if entry.is_file() {
            // Archives may list nested files without explicit directory
            // entries, so create intermediate directories first.
            if let Some(parent) = out_path.parent() {
                std::fs::create_dir_all(parent)?;
            }
            let mut output = File::create(&out_path)?;
            // io::copy streams in fixed-size chunks — the whole entry is
            // never resident in memory at once.
            io::copy(&mut entry, &mut output)?;
            println!("Extracted: {:?} ({} bytes)", out_path, entry.size());
        } else if entry.is_dir() {
            std::fs::create_dir_all(&out_path)?;
        }
    }
    Ok(())
}
io::copy streams data in chunks, avoiding loading the entire file into memory.
use std::fs::File;
use zip::ZipArchive;
/// Anti-example: reading a whole entry into a Vec ties memory usage to
/// the entry's *uncompressed* size.
fn memory_problem() -> Result<(), Box<dyn std::error::Error>> {
    // read_to_end is a `Read` trait method; the trait must be in scope
    // (the original snippet omitted this import, so it would not compile).
    use std::io::Read;

    let file = File::open("archive.zip")?;
    let mut archive = ZipArchive::new(file)?;
    let mut entry = archive.by_name("huge_file.bin")?;
    // BAD: Loading entire file into memory
    let mut buffer = Vec::new();
    entry.read_to_end(&mut buffer)?;
    // If huge_file.bin is 1GB compressed, 5GB uncompressed,
    // we now have 5GB in memory!
    println!("Loaded {} bytes into memory", buffer.len());
    Ok(())
}
/// Preferred pattern: stream the entry straight to disk with a
/// constant-size copy buffer.
fn better_approach() -> Result<(), Box<dyn std::error::Error>> {
    let mut archive = ZipArchive::new(File::open("archive.zip")?)?;
    let mut entry = archive.by_name("huge_file.bin")?;
    let mut dest = File::create("huge_file.bin")?;
    // GOOD: Stream to disk
    std::io::copy(&mut entry, &mut dest)?;
    // Memory usage is constant (buffer size ~8KB)
    Ok(())
}
Streaming keeps memory usage constant regardless of file size.
use std::fs::File;
use std::io::{self, BufWriter, Write};
use zip::ZipArchive;
/// Extract every file of a (possibly huge) archive under "output/".
/// Memory cost is the central directory plus one copy buffer.
fn process_large_archive() -> Result<(), Box<dyn std::error::Error>> {
    // Even with a 10GB archive, ZipArchive only loads the central
    // directory (metadata).
    let file = File::open("huge_archive.zip")?;
    let mut archive = ZipArchive::new(file)?;
    println!("Archive contains {} files", archive.len());
    for i in 0..archive.len() {
        let mut entry = archive.by_index(i)?;
        if entry.is_file() {
            let name = entry.name().to_string();
            println!("Processing: {}", name);
            // SECURITY: entry names are untrusted input. mangled_name()
            // strips "../" components so a hostile archive cannot write
            // outside the output directory (zip-slip).
            let out_path = std::path::Path::new("output").join(entry.mangled_name());
            if let Some(parent) = out_path.parent() {
                std::fs::create_dir_all(parent)?;
            }
            let output = File::create(&out_path)?;
            let mut writer = BufWriter::new(output);
            io::copy(&mut entry, &mut writer)?;
            // Flush explicitly: BufWriter's Drop silently swallows
            // write errors.
            writer.flush()?;
        }
    }
    Ok(())
}
Each file is extracted independently, keeping memory bounded.
use std::fs::File;
use std::io::{self, BufReader, BufWriter};
use zip::ZipArchive;
/// Extract all files with a buffered reader and buffered writers —
/// useful when the archive holds many small entries.
fn buffered_extraction() -> Result<(), Box<dyn std::error::Error>> {
    // Needed for flush(); the snippet-level imports omit Write.
    use std::io::Write;

    // The underlying file can be buffered for better performance.
    let file = File::open("archive.zip")?;
    let buf_reader = BufReader::new(file);
    let mut archive = ZipArchive::new(buf_reader)?;
    for i in 0..archive.len() {
        let mut entry = archive.by_index(i)?;
        if entry.is_file() {
            let out_path = entry.mangled_name();
            // Nested entries need their parent directories to exist.
            if let Some(parent) = out_path.parent() {
                std::fs::create_dir_all(parent)?;
            }
            let output = File::create(&out_path)?;
            let mut writer = BufWriter::new(output);
            // io::copy uses internal buffers.
            io::copy(&mut entry, &mut writer)?;
            // Flush explicitly so write errors surface as Err, not a
            // silent failure in Drop.
            writer.flush()?;
        }
    }
    Ok(())
}
Buffering improves I/O performance for many small files.
use std::fs::File;
use std::io::Read;
use zip::ZipArchive;
/// Decompress data.csv and feed it to process_chunk() in 8KB slices,
/// so memory stays constant however large the entry is.
fn process_in_memory_chunks() -> Result<(), Box<dyn std::error::Error>> {
    let mut archive = ZipArchive::new(File::open("archive.zip")?)?;
    let mut entry = archive.by_name("data.csv")?;

    // Fixed scratch buffer reused for every read.
    let mut chunk = [0u8; 8192];
    let mut total_bytes: u64 = 0;

    // read() returning 0 signals end of the decompressed stream.
    loop {
        match entry.read(&mut chunk)? {
            0 => break,
            n => {
                process_chunk(&chunk[..n]);
                total_bytes += n as u64;
            }
        }
    }
    println!("Processed {} bytes", total_bytes);
    Ok(())
}
fn process_chunk(data: &[u8]) {
// Stub consumer: receives one decompressed slice at a time (at most the
// caller's buffer size per call), so the full file never needs to be
// resident. Real processing of `data` would go here.
}Chunk-by-chunk processing keeps memory bounded even during computation.
use std::fs::File;
use zip::ZipArchive;
/// Print compressed vs. uncompressed sizes for every entry.
fn size_handling() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("archive.zip")?;
    let mut archive = ZipArchive::new(file)?;
    for i in 0..archive.len() {
        let entry = archive.by_index(i)?;
        println!("File: {}", entry.name());
        println!(" Compressed: {} bytes", entry.compressed_size());
        println!(" Uncompressed: {} bytes", entry.size());
        // Guard: zero-length entries (and directory entries) would make
        // the ratio divide by zero and print NaN.
        if entry.size() > 0 {
            println!(" Ratio: {:.1}%",
                (entry.compressed_size() as f64 / entry.size() as f64) * 100.0);
        }
    }
    Ok(())
}
Knowing both sizes helps estimate memory needs for in-memory operations.
use std::fs::File;
use std::io::Read;
use zip::ZipArchive;
fn zipfile_lifecycle() -> Result<(), Box<dyn std::error::Error>> {
let file = File::open("archive.zip")?;
let mut archive = ZipArchive::new(file)?;
// by_index() and by_name() return a ZipFile, which implements Read and
// mutably borrows the archive for its whole lifetime.
{
let mut entry = archive.by_index(0)?;
// entry is a Read handle over the decompressed byte stream
let mut first_byte = [0u8; 1];
entry.read_exact(&mut first_byte)?;
// entry goes out of scope here, releasing the mutable borrow of `archive`
}
// Can now access another file
let mut entry2 = archive.by_index(1)?;
// Important: only one entry handle can be live at a time. Because
// ZipFile holds a mutable borrow, keeping two at once is rejected at
// COMPILE TIME by the borrow checker (it is not a runtime panic):
// let mut entry1 = archive.by_index(0)?;
// let mut entry2 = archive.by_index(1)?; // does not compile
Ok(())
}Only one ZipFile can be active at a time from a single ZipArchive.
use std::fs::File;
use std::io::Read;
use zip::ZipArchive;
fn concurrent_access_problem() -> Result<(), Box<dyn std::error::Error>> {
let file = File::open("archive.zip")?;
let mut archive = ZipArchive::new(file)?;
// This pattern doesn't work: by_index() takes `&mut self`, so holding
// two ZipFile handles from one archive is a borrow-checker error at
// compile time (not a runtime panic):
// let mut file1 = archive.by_index(0)?;
// let mut file2 = archive.by_index(1)?; // does not compile
// Instead, extract one at a time — each inner scope ends the previous
// handle's borrow before the next one starts.
{
let mut entry = archive.by_index(0)?;
// Process file 0
}
{
let mut entry = archive.by_index(1)?;
// Process file 1
}
Ok(())
}
// For true concurrent access, open multiple archive handles
fn concurrent_access_solution() -> Result<(), Box<dyn std::error::Error>> {
    // Each ZipArchive owns its own reader and its own copy of the
    // central directory, so two handles over the same path can serve
    // entries independently.
    let mut archive1 = ZipArchive::new(File::open("archive.zip")?)?;
    let mut archive2 = ZipArchive::new(File::open("archive.zip")?)?;

    let mut entry1 = archive1.by_index(0)?;
    let mut entry2 = archive2.by_index(1)?;
    // Now both are accessible simultaneously
    // But this means two file handles and two central directories in memory
    Ok(())
}
Multiple archive instances allow concurrent access but use more resources.
use std::fs::File;
use std::io::Read;
use zip::ZipArchive;
// Note: The zip crate doesn't directly support memory-mapped files,
// but you can provide any Read + Seek implementation
/// Load the whole archive into RAM (capped at 100MB) for fast random access.
fn with_vec_buffer() -> Result<(), Box<dyn std::error::Error>> {
    // Safety cap: refuse archives over the limit instead of truncating
    // them with Read::take as the original did. A truncated zip loses
    // its central directory (stored at the END of the file), so
    // ZipArchive::new would fail with a confusing error.
    const MAX_BYTES: u64 = 100_000_000;
    let mut file = File::open("archive.zip")?;
    let len = file.metadata()?.len();
    if len > MAX_BYTES {
        return Err(format!("archive too large to buffer: {} bytes", len).into());
    }
    let mut buffer = Vec::with_capacity(len as usize);
    file.read_to_end(&mut buffer)?;
    // Create the archive from the in-memory buffer: fast random access,
    // at the cost of holding the entire archive in memory.
    let cursor = std::io::Cursor::new(buffer);
    let _archive = ZipArchive::new(cursor)?;
    Ok(())
}
Loading the entire archive into memory enables fast random access at the cost of memory.
use std::fs::File;
use zip::ZipArchive;
/// Rough estimate of the central directory's in-memory footprint.
fn estimate_memory() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("archive.zip")?;
    let archive = ZipArchive::new(file)?;
    // Central directory memory is roughly ~50 bytes per entry, plus the
    // filename bytes, plus some allocator overhead.
    //
    // file_names() borrows the archive immutably; the original called
    // by_index(), which needs `&mut self` and does not compile against
    // this immutable binding.
    let name_bytes: usize = archive.file_names().map(str::len).sum();
    let estimated_metadata = archive.len() * 50 + name_bytes;
    println!("Files in archive: {}", archive.len());
    println!("Filename bytes: {}", name_bytes);
    println!("Estimated metadata: {} bytes", estimated_metadata);
    println!("Memory for extraction: O(largest_file_uncompressed)");
    Ok(())
}
Memory for the archive is proportional to file count, not file sizes.
use std::fs::File;
use std::io;
use zip::ZipArchive;
/// Extract only a fixed allowlist of entries; everything else in the
/// archive is never decompressed.
fn extract_specific_files() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("archive.zip")?;
    let mut archive = ZipArchive::new(file)?;
    // Only extract what you need
    let needed_files = ["config.json", "data.csv", "output.log"];
    for name in &needed_files {
        match archive.by_name(name) {
            Ok(mut entry) => {
                // io::copy requires `&mut W`; the original passed the
                // shared borrow `&output`, which does not compile.
                let mut output = File::create(name)?;
                io::copy(&mut entry, &mut output)?;
                println!("Extracted: {}", name);
            }
            Err(_) => {
                println!("File not found: {}", name);
            }
        }
    }
    // Memory is only used for the files actually being extracted.
    Ok(())
}
Extract only needed files to minimize both memory and disk I/O.
use std::fs::File;
use zip::ZipArchive;
/// Report entries larger than 1GB; Zip64 sizes come through correctly.
fn zip64_handling() -> Result<(), Box<dyn std::error::Error>> {
    // Zip64 supports archives > 4GB and files > 4GB; the zip crate
    // handles the format transparently.
    let mut archive = ZipArchive::new(File::open("large_archive.zip")?)?;
    for i in 0..archive.len() {
        let entry = archive.by_index(i)?;
        // size() is u64, so Zip64 entry sizes are reported correctly.
        let bytes = entry.size();
        if bytes > 1_000_000_000 {
            println!("Large file: {} ({} GB)",
                entry.name(),
                bytes / 1_000_000_000);
        }
    }
    Ok(())
}
Zip64 is handled transparently for large archives and files.
use std::fs::File;
use std::io::{self, Write};
use zip::{ZipWriter, write::FileOptions};
use flate2::Compression;
/// Write 1000 small deflate-compressed entries into output.zip.
fn write_large_archive() -> Result<(), Box<dyn std::error::Error>> {
    let mut zip = ZipWriter::new(File::create("output.zip")?);
    let options = FileOptions::default()
        .compression_method(zip::CompressionMethod::Deflated)
        .compression_level(Some(Compression::default().level()));

    // Entries are compressed and written one at a time, so per-entry
    // buffers are released before the next iteration — memory does not
    // accumulate across the loop.
    for i in 0..1000 {
        zip.start_file(format!("file_{}.txt", i), options)?;
        let body = format!("Content for file {}", i);
        zip.write_all(body.as_bytes())?;
    }

    // finish() writes the central directory and finalizes the archive.
    zip.finish()?;
    Ok(())
}
Writing uses streaming too; files are compressed and written incrementally.
use std::fs::File;
use std::io::{self, BufReader, Read, Write};
use zip::{ZipWriter, write::FileOptions};
/// Copy a large source file into a new archive without ever holding it
/// fully in memory.
fn stream_large_file_to_zip() -> Result<(), Box<dyn std::error::Error>> {
    let mut zip = ZipWriter::new(File::create("archive.zip")?);
    let options = FileOptions::default()
        .compression_method(zip::CompressionMethod::Deflated);

    // Begin the entry, then stream the source straight into the writer;
    // data is compressed chunk by chunk as it flows through io::copy.
    zip.start_file("large_file.bin", options)?;
    let mut reader = BufReader::new(File::open("large_source.bin")?);
    io::copy(&mut reader, &mut zip)?;

    zip.finish()?;
    Ok(())
}
Large files can be streamed into archives without full memory loading.
use std::fs::File;
use std::io::Read;
use zip::ZipArchive;
/// Side-by-side comparison of three access strategies for a 1GB archive
/// containing 100MB compressed files (the inner fns are illustrative).
fn memory_comparison() -> Result<(), Box<dyn std::error::Error>> {
    // Approach 1: slurp the entire archive (BAD for large archives).
    // Memory: ~1GB for the raw bytes, plus decompression buffers.
    fn load_all_memory() -> Vec<u8> {
        let mut file = File::open("archive.zip").unwrap();
        let mut bytes = Vec::new();
        file.read_to_end(&mut bytes).unwrap();
        bytes // Entire archive in memory
    }

    // Approach 2: ZipArchive with streaming (GOOD).
    // Memory: central-directory metadata plus one ~8KB copy buffer.
    fn streaming_approach() -> Result<(), Box<dyn std::error::Error>> {
        let mut archive = ZipArchive::new(File::open("archive.zip")?)?;
        let mut entry = archive.by_index(0)?;
        let mut output = File::create("output.bin")?;
        std::io::copy(&mut entry, &mut output)?;
        Ok(())
    }

    // Approach 3: process in fixed-size chunks (BEST for computation).
    // Memory: constant — just the scratch buffer.
    fn chunk_processing() -> Result<(), Box<dyn std::error::Error>> {
        let mut archive = ZipArchive::new(File::open("archive.zip")?)?;
        let mut entry = archive.by_index(0)?;
        let mut scratch = [0u8; 8192];
        loop {
            let n = entry.read(&mut scratch)?;
            if n == 0 { break; }
            // Process &scratch[..n] here.
        }
        Ok(())
    }

    Ok(())
}
The streaming approach scales to any file size with constant memory.
ZipArchive handles large files efficiently through a design that separates metadata loading from content extraction.

Memory model:
- ZipArchive::new() loads only the central directory (file metadata), not file contents.

Extraction approaches:
| Approach | Memory Usage | When to Use |
|----------|--------------|-------------|
| read_to_end() | Full uncompressed size | Small files only |
| io::copy() to file | ~8KB buffer | Large files to disk |
| Chunk processing | Configurable buffer | Processing without disk I/O |

Key constraints:
- Only one ZipFile can be active per ZipArchive instance.
- Concurrent access requires multiple ZipArchive instances.

Best practices:
- Prefer io::copy() or chunk processing over read_to_end().
- Use BufReader/BufWriter for better I/O performance.

The zip crate's design ensures that memory usage scales with file count, not file size, making it suitable for archives of any size.