Examples
Basic Usage
Here’s a simple example of using PagedList:
Basic Usage Example
"""
Example usage of the disk-backed list package.
"""
from paged_list import PagedList
def basic_example():
    """Run a basic example of PagedList usage.

    Walks through appending, indexing, slicing, item assignment,
    serialization and mapping on a small PagedList, printing the outcome of
    each step. ``cleanup_chunks()`` is always called at the end, even if one
    of the steps raises.
    """

    def double_value(record):
        """Return a copy of *record* with a numeric "value" doubled.

        Records without a numeric "value" are returned unchanged.
        """
        if "value" in record and isinstance(record["value"], (int, float)):
            # Copy first so the record passed in is left untouched.
            record = record.copy()
            record["value"] *= 2
        return record

    print("Basic PagedList Example")
    print("=========================")

    # Create a PagedList with small chunk size for demonstration
    cl = PagedList(chunk_size=5, disk_path="example_data")
    try:
        # Add some data
        print("Adding 12 items...")
        for i in range(12):
            cl.append(
                {
                    "id": i,
                    "name": f"item_{i}",
                    "value": i * 10,
                    "metadata": {"category": "test", "active": True},
                }
            )
        print(f"List length: {len(cl)}")
        print(f"Chunks created: {cl.total_chunks}")
        print(f"Items in memory: {cl.in_memory_count}")

        # Demonstrate indexing
        print(f"\nFirst item: {cl[0]}")
        print(f"Last item: {cl[-1]}")
        print(f"Item at index 5: {cl[5]}")

        # Demonstrate slicing
        print(f"\nSlice [3:7]: {cl[3:7]}")

        # Demonstrate updating
        print("\nUpdating item at index 5...")
        cl[5] = {"id": 5, "name": "updated_item_5", "value": 999, "updated": True}
        print(f"Updated item: {cl[5]}")

        # Demonstrate serialization
        print("\nSerializing complex data types...")
        cl.serialize()
        print("Serialization complete - boolean and dict values are now JSON strings")

        # Demonstrate mapping
        print("\nApplying transformation to double all values...")
        # Fetch the first record once per reading instead of indexing twice.
        first_before = cl[0]
        old_value = first_before["value"] if "value" in first_before else "N/A"
        cl.map(double_value)
        first_after = cl[0]
        new_value = first_after["value"] if "value" in first_after else "N/A"
        print(f"First item value changed from {old_value} to {new_value}")
    finally:
        # Clean up, even when a step above failed part-way through.
        print("\nCleaning up...")
        cl.cleanup_chunks()
    print("Example completed!")
def performance_example():
    """Demonstrate performance with larger dataset.

    Appends 50,000 records to a PagedList, prints its size statistics, then
    times 1,000 random index lookups. ``cleanup_chunks()`` is always called
    at the end, even if one of the steps raises.
    """
    # Function-scope imports: only this example needs them.
    import random
    import time

    print("Performance Example")
    print("===================")

    # Create a larger list
    cl = PagedList(chunk_size=10000, disk_path="perf_data")
    try:
        print("Adding 50,000 items...")
        for i in range(50000):
            cl.append(
                {
                    "id": i,
                    "timestamp": f"2024-01-{(i % 30) + 1:02d}",
                    "value": i * 3.14,
                    "category": f"category_{i % 10}",
                }
            )
        print(f"Total items: {len(cl)}")
        print(f"Chunks on disk: {cl.total_chunks}")
        print(f"Items in memory: {cl.in_memory_count}")

        # Test random access performance
        # Read the length once so the timed loop measures only the lookups.
        item_count = len(cl)
        # perf_counter() is the monotonic, high-resolution clock meant for
        # measuring short durations; time.time() can jump with the wall clock.
        start_time = time.perf_counter()
        # Access 1000 random items
        for _ in range(1000):
            idx = random.randrange(item_count)
            _ = cl[idx]
        elapsed = time.perf_counter() - start_time
        print(f"Time to access 1000 random items: {elapsed:.3f} seconds")
    finally:
        # Clean up, even when a step above failed part-way through.
        cl.cleanup_chunks()
    print("Performance example completed!")
def main():
    """Run the basic example, print a divider, then run the performance example."""
    divider = "=" * 50
    basic_example()
    print(f"\n{divider}\n")
    performance_example()


# Only run the examples when this file is executed as a script.
if __name__ == "__main__":
    main()
Advanced Usage
Context Manager
from paged_list import PagedList
# Cleanup happens automatically when the with-block exits.
with PagedList(chunk_size=10000) as pl:
    # Add lots of data: one million small records.
    for item_number in range(1_000_000):
        pl.append({"data": f"item_{item_number}"})

    # Process data: read a ten-item window from the middle of the list.
    window = slice(500_000, 500_010)
    result = pl[window]
Custom Serialization
# Serialize complex Python objects to JSON strings
record = {
    "id": 1,
    "metadata": {"tags": ["python", "data"], "active": True},
    "scores": [1.2, 3.4, 5.6],
}
pl.append(record)

# serialize() converts lists, dicts, and bools to JSON strings
pl.serialize()
Parallel Processing
# Process data in parallel across chunks
def process_record(record):
    """Return a copy of *record* marked as processed.

    The returned dict has every key of *record* plus ``"processed": True``
    and ``"timestamp": "2024-01-01"``. The input is not modified, so the
    function has no side effects when it is run from several worker threads.
    """
    return {**record, "processed": True, "timestamp": "2024-01-01"}
pl.map(process_record, max_workers=4)  # max_workers=4: run the mapping on 4 threads
Use Cases
Large Dataset Processing: Handle datasets that don’t fit in memory
Data Pipelines: Process streaming data with automatic disk overflow
ETL Operations: Transform large datasets chunk by chunk
Data Analysis: Analyze large datasets without holding them entirely in memory
Caching: Implement persistent, memory-efficient caches