Best Practices for Building Secure, Isolated Multi-Tenant Agent Systems
Multi-tenant architectures are essential for SaaS platforms hosting AI agents. Each tenant (organization) needs complete isolation, security, and the ability to scale independently. Here's how to build enterprise-grade multi-tenant agent systems.
The Multi-Tenancy Challenge
Building multi-tenant agent systems involves unique challenges:
- Complete Isolation: One tenant's agents must never access another's data
- Resource Fairness: Prevent one tenant from monopolizing resources
- Independent Scaling: Each tenant should scale based on its own needs
- Security: Meet enterprise-grade security and compliance requirements
- Customization: Tenants need to customize agent behavior
Architecture Patterns
1. Namespace Isolation
/**
 * Scopes every messaging subject used by one tenant under `tenant.<tenantId>`.
 *
 * Messages travel through `this.nc`, which must expose
 * `subscribe(subject, handler)` and `publish(subject, payload)`.
 * NOTE(review): the property name suggests a NATS connection — confirm.
 */
class TenantNamespace {
  /**
   * @param {string|number} tenantId - Tenant identifier; used as a single subject token.
   * @param {object} [nc] - Messaging client. Optional for backward
   *   compatibility: when omitted, `this.nc` is left untouched so it can be
   *   supplied by a subclass, the prototype, or a later assignment.
   * @throws {TypeError} If tenantId is missing or is not a single subject token.
   */
  constructor(tenantId, nc) {
    // In NATS-style subjects `.` separates tokens and `*` / `>` are wildcards.
    // A tenant id containing any of them would make this namespace overlap
    // with other tenants' subjects, so it is rejected up front.
    if (tenantId == null || !/^[^\s.*>]+$/.test(String(tenantId))) {
      throw new TypeError(
        'tenantId must be a non-empty subject token without ".", "*", ">" or whitespace'
      );
    }

    this.tenantId = tenantId;
    this.namespace = `tenant.${tenantId}`;

    // Only assign when provided so an inherited `nc` is not shadowed by undefined.
    if (nc !== undefined) {
      this.nc = nc;
    }
  }

  /**
   * Prefix a tenant-relative subject with this tenant's namespace.
   * @param {string} subject - Subject relative to the tenant namespace.
   * @returns {string} `tenant.<tenantId>.<subject>`
   */
  getSubject(subject) {
    return `${this.namespace}.${subject}`;
  }

  /**
   * Subscribe to a pattern inside this tenant's namespace.
   * @param {string} pattern - Tenant-relative subject or pattern.
   * @param {Function} handler - Passed through to the messaging client.
   * @returns {*} Whatever `this.nc.subscribe` returns.
   */
  subscribe(pattern, handler) {
    const namespacedPattern = this.getSubject(pattern);
    return this.nc.subscribe(namespacedPattern, handler);
  }

  /**
   * Publish a payload on a subject inside this tenant's namespace.
   * @param {string} subject - Tenant-relative subject.
   * @param {object} data - Payload fields.
   * @returns {*} Whatever `this.nc.publish` returns.
   */
  publish(subject, data) {
    const namespacedSubject = this.getSubject(subject);
    return this.nc.publish(namespacedSubject, {
      ...data,
      // Written after the spread so a payload cannot carry its own
      // `tenantId` or `timestamp`.
      tenantId: this.tenantId,
      timestamp: Date.now()
    });
  }
}

// Usage (`nc` is the messaging client, `taskData` the payload to send)
const tenant = new TenantNamespace('org_12345', nc);
await tenant.publish('agents.task', taskData);
// Actually publishes to: tenant.org_12345.agents.task
2. Resource Pool Isolation
/**
 * Tracks one tenant's resource usage against its limits and creates agents
 * only while the tenant stays inside those limits.
 *
 * Relies on `calculateResources(agentConfig)` and `canAllocate(resources)`,
 * which are not defined in this class and must be supplied elsewhere
 * (subclass, prototype or instance).
 */
class TenantResourcePool {
  /**
   * @param {string} tenantId - Tenant that owns the pool.
   * @param {object} [limits] - Overrides for the default limits. Entries that
   *   are `undefined` or `null` are ignored so they cannot erase a default.
   */
  constructor(tenantId, limits = {}) {
    this.tenantId = tenantId;

    const overrides = {};
    for (const [name, value] of Object.entries(limits || {})) {
      if (value !== undefined && value !== null) {
        overrides[name] = value;
      }
    }

    this.limits = {
      maxAgents: 100,
      maxCPU: 10,
      maxMemory: '10GB',
      maxMessages: 100000,
      ...overrides
    };

    this.usage = {
      agents: 0,
      cpu: 0,
      memory: 0,
      messages: 0
    };
  }

  /**
   * Reserve resources for a new agent and create it.
   * @param {object} agentConfig - Configuration forwarded to the agent.
   * @returns {Promise<*>} The value produced by `createIsolatedAgent`.
   * @throws {Error} 'Agent limit reached for tenant' or 'Insufficient resources'.
   */
  async requestAgent(agentConfig) {
    // Check limits before allocation
    if (this.usage.agents >= this.limits.maxAgents) {
      throw new Error('Agent limit reached for tenant');
    }

    const resources = this.calculateResources(agentConfig);
    if (!this.canAllocate(resources)) {
      throw new Error('Insufficient resources');
    }

    // Allocate resources.
    // NOTE(review): assumes resources.cpu / resources.memory are numbers —
    // confirm against calculateResources.
    this.usage.agents++;
    this.usage.cpu += resources.cpu;
    this.usage.memory += resources.memory;

    try {
      // `await` keeps an asynchronous creation failure inside this try block.
      return await this.createIsolatedAgent(agentConfig);
    } catch (error) {
      // Give the reservation back, otherwise every failed creation would
      // permanently consume part of the tenant's quota.
      this.usage.agents--;
      this.usage.cpu -= resources.cpu;
      this.usage.memory -= resources.memory;
      throw error;
    }
  }

  /**
   * Build an agent bound to this tenant.
   * @param {object} config - Caller-supplied agent configuration.
   * @returns {Agent} Agent constructed with tenant-scoped settings.
   */
  createIsolatedAgent(config) {
    return new Agent({
      ...config,
      // Everything below is written after the spread so the caller's config
      // cannot override the tenant binding or the sandbox flag.
      tenantId: this.tenantId,
      namespace: `tenant.${this.tenantId}`,
      resources: {
        cpu: config.cpu || 0.1,
        memory: config.memory || '100MB'
      },
      sandbox: true // Run in isolated environment
    });
  }
}
Data Isolation Strategies
1. Database Isolation
/**
 * Data access for one tenant: per-tenant tables named `tenant_<tenantId>_<table>`
 * plus tenant-filtered queries against shared tables.
 *
 * All storage calls go through `this.db`, which must expose
 * `createTable(name, schema)` and `query(table, conditions)`.
 */
class TenantDataStore {
  /**
   * @param {string} tenantId - Tenant whose data this store accesses.
   * @param {object} [db] - Database client. Optional for backward
   *   compatibility: when omitted, `this.db` is left untouched so it can be
   *   supplied by a subclass, the prototype, or a later assignment.
   */
  constructor(tenantId, db) {
    this.tenantId = tenantId;
    this.tablePrefix = `tenant_${tenantId}_`;

    // Only assign when provided so an inherited `db` is not shadowed by undefined.
    if (db !== undefined) {
      this.db = db;
    }
  }

  /**
   * Create this tenant's tables, one after another.
   * @returns {Promise<void>}
   */
  async createTenantSchema() {
    const tables = [
      'agents',
      'knowledge_base',
      'workflows',
      'audit_logs'
    ];

    for (const table of tables) {
      await this.db.createTable(`${this.tablePrefix}${table}`, {
        // Schema definition
      });
    }
  }

  /**
   * Query one of this tenant's own tables.
   * @param {string} table - Table name without the tenant prefix.
   * @param {object} [conditions] - Filter conditions.
   * @returns {Promise<*>} Result of `this.db.query`.
   */
  async query(table, conditions) {
    return this.db.query(`${this.tablePrefix}${table}`, {
      ...conditions,
      // Written after the spread: a caller-supplied tenantId is overwritten.
      tenantId: this.tenantId
    });
  }

  /**
   * Query a shared table, restricted to this tenant's non-deleted rows.
   * @param {string} table - Shared table name, used as given.
   * @param {object} [conditions] - Filter conditions.
   * @returns {Promise<*>} Result of `this.db.query`.
   */
  async secureQuery(table, conditions) {
    return this.db.query(table, {
      ...conditions,
      // Both filters are written after the spread so callers cannot replace them.
      tenantId: this.tenantId,
      deletedAt: null
    });
  }
}
2. Encryption at Rest
/**
 * AES-256-GCM encryption of JSON-serialisable values with a key that is
 * unique per tenant.
 *
 * The tenant key is derived from a secret master key and the tenant id, so
 * knowing a tenant id alone is not enough to reconstruct its key.
 */
class TenantEncryption {
  /**
   * @param {string} tenantId - Tenant the key is derived for.
   * @param {string|Buffer} [masterKey] - Secret the tenant key is derived
   *   from. When omitted, `this.masterKey` must already be available (for
   *   example from a subclass or the prototype).
   * @throws {Error} If no master key is available.
   */
  constructor(tenantId, masterKey) {
    this.tenantId = tenantId;

    // Only assign when provided so an inherited `masterKey` is not shadowed.
    if (masterKey !== undefined) {
      this.masterKey = masterKey;
    }

    // Each tenant gets unique encryption key
    this.key = this.deriveKey(tenantId);
  }

  /**
   * Derive the 32-byte tenant key as HMAC-SHA256(masterKey, label + tenantId).
   * @param {string} tenantId - Tenant identifier.
   * @returns {Buffer} 32-byte key for aes-256-gcm.
   * @throws {Error} If no master key is configured.
   */
  deriveKey(tenantId) {
    const master = this.masterKey;
    if (master === undefined || master === null || master.length === 0) {
      throw new Error('TenantEncryption requires a master key to derive tenant keys');
    }

    // The fixed label keeps keys derived here distinct from any other use
    // of the same master key.
    return crypto
      .createHmac('sha256', master)
      .update(`tenant-encryption:${tenantId}`)
      .digest();
  }

  /**
   * Encrypt a JSON-serialisable value.
   * @param {*} data - Value passed through `JSON.stringify`.
   * @returns {Promise<{data: string, iv: string, tag: string}>} Base64
   *   ciphertext, IV and authentication tag.
   */
  async encrypt(data) {
    // 12 bytes (96 bits) is the IV size recommended for GCM. decrypt() takes
    // the IV from the payload, so data written with a 16-byte IV still decrypts.
    const iv = crypto.randomBytes(12);
    const cipher = crypto.createCipheriv('aes-256-gcm', this.key, iv);

    const encrypted = Buffer.concat([
      cipher.update(JSON.stringify(data)),
      cipher.final()
    ]);

    return {
      data: encrypted.toString('base64'),
      iv: iv.toString('base64'),
      tag: cipher.getAuthTag().toString('base64')
    };
  }

  /**
   * Decrypt and parse a payload produced by `encrypt`.
   * @param {{data: string, iv: string, tag: string}} encryptedData
   * @returns {Promise<*>} The original value.
   * @throws {Error} If the tag has the wrong length or authentication fails.
   */
  async decrypt(encryptedData) {
    const tag = Buffer.from(encryptedData.tag, 'base64');
    // encrypt() always emits the default 16-byte tag. Accepting a shorter,
    // attacker-chosen tag would weaken the integrity check.
    if (tag.length !== 16) {
      throw new Error('Invalid authentication tag length');
    }

    const decipher = crypto.createDecipheriv(
      'aes-256-gcm',
      this.key,
      Buffer.from(encryptedData.iv, 'base64')
    );
    decipher.setAuthTag(tag);

    const decrypted = Buffer.concat([
      decipher.update(Buffer.from(encryptedData.data, 'base64')),
      decipher.final()
    ]);

    return JSON.parse(decrypted.toString());
  }
}
Authentication and Authorization
1. JWT-Based Tenant Isolation
/**
 * Issues and validates tenant-scoped JWTs.
 *
 * Relies on members that are not defined in this class and must be supplied
 * elsewhere: `getSecret(tenantId)`, `publicKey`, `extractToken(req)` and
 * `hasPermission(permissions, action)`, plus a `jwt` module in scope.
 * NOTE(review): `jwt.sign` / `jwt.verify` look like the jsonwebtoken API — confirm.
 */
class TenantAuth {
  /**
   * Create a token valid for 24 hours.
   * @param {string} userId - Stored in the `sub` claim.
   * @param {string} tenantId - Stored in the `tenant` claim and as audience.
   * @param {Array} permissions - Stored in the `permissions` claim.
   * @returns {Promise<string>} Result of `jwt.sign`.
   */
  async generateToken(userId, tenantId, permissions) {
    // JWT `iat` / `exp` are NumericDate values: seconds since the epoch.
    // Millisecond values would be read as a date tens of thousands of years
    // away, so the token would effectively never expire.
    const issuedAt = Math.floor(Date.now() / 1000);

    const payload = {
      sub: userId,
      tenant: tenantId,
      permissions: permissions,
      iat: issuedAt,
      exp: issuedAt + (24 * 60 * 60) // 24 hours
    };

    // NOTE(review): RS256 signs with an RSA private key, so getSecret() must
    // return the private key that pairs with this.publicKey — confirm.
    return jwt.sign(payload, this.getSecret(tenantId), {
      algorithm: 'RS256',
      issuer: 'artcafe.ai',
      audience: tenantId
    });
  }

  /**
   * Verify the request's token, its tenant and the requested action.
   * @param {object} req - Must carry `tenantId` and `action`.
   * @returns {Promise<object>} The decoded token payload.
   * @throws {Error} 'Authentication failed' for every failure. The underlying
   *   error is attached as `cause` for logging.
   */
  async validateRequest(req) {
    const token = this.extractToken(req);

    try {
      const decoded = jwt.verify(token, this.publicKey, {
        algorithms: ['RS256'],
        issuer: 'artcafe.ai',
        // generateToken() sets the audience to the tenant id, so require it
        // to match the tenant this request is addressed to.
        audience: req.tenantId
      });

      // Verify tenant access
      if (decoded.tenant !== req.tenantId) {
        throw new Error('Tenant mismatch');
      }

      // Check permissions
      if (!this.hasPermission(decoded.permissions, req.action)) {
        throw new Error('Insufficient permissions');
      }

      return decoded;
    } catch (error) {
      // Callers see one generic message; the real reason is kept on `cause`.
      const failure = new Error('Authentication failed');
      failure.cause = error;
      throw failure;
    }
  }
}
2. API Key Management
/**
 * Creates and validates tenant API keys of the form `<tenantId>_<secret>`.
 * Only a bcrypt hash of the secret is stored.
 *
 * Relies on `this.store` (`save`, `find`, `update`), `this.generateSecureKey()`
 * and the `bcrypt` / `generateId` names, none of which are defined here.
 */
class TenantAPIKeys {
  /**
   * Create and persist a new API key.
   * @param {string} tenantId - Owning tenant.
   * @param {string} name - Human-readable key name.
   * @param {Array} permissions - Permissions granted to the key.
   * @returns {Promise<{key: string, name: string, permissions: Array}>} The
   *   plaintext key; it is not stored and cannot be recovered later.
   */
  async createAPIKey(tenantId, name, permissions) {
    const key = this.generateSecureKey();
    const hashedKey = await bcrypt.hash(key, 10);

    await this.store.save({
      id: generateId(),
      tenantId,
      name,
      keyHash: hashedKey,
      permissions,
      created: Date.now(),
      lastUsed: null,
      active: true
    });

    // Return key only once
    return {
      key: `${tenantId}_${key}`,
      name,
      permissions
    };
  }

  /**
   * Validate a presented API key.
   * @param {string} apiKey - Key in the form returned by `createAPIKey`.
   * @returns {Promise<{valid: boolean, tenantId?: string, permissions?: Array}>}
   */
  async validateAPIKey(apiKey) {
    if (typeof apiKey !== 'string') {
      return { valid: false };
    }

    // Tenant ids (e.g. `org_12345`) and secrets may both contain `_`, so the
    // first underscore is not necessarily the separator. Try every underscore
    // as the split point; the correct one is the one whose tenant has a
    // stored hash matching the remainder. Costs one store lookup per underscore.
    for (
      let at = apiKey.indexOf('_');
      at !== -1;
      at = apiKey.indexOf('_', at + 1)
    ) {
      const tenantId = apiKey.slice(0, at);
      const key = apiKey.slice(at + 1);
      if (tenantId === '' || key === '') {
        continue;
      }

      const storedKeys = await this.store.find({
        tenantId,
        active: true
      });

      for (const stored of storedKeys) {
        if (await bcrypt.compare(key, stored.keyHash)) {
          // Update last used
          await this.store.update(stored.id, {
            lastUsed: Date.now()
          });

          return {
            valid: true,
            tenantId,
            permissions: stored.permissions
          };
        }
      }
    }

    return { valid: false };
  }
}
Resource Management and Quotas
1. Rate Limiting per Tenant
/**
 * Fixed-window rate limiter keyed by tenant and resource, backed by Redis
 * counters.
 *
 * Uses `this.redis` (`incrby`, `expire`, `ttl`) and `this.getTenantPlan(tenantId)`,
 * which is not defined in this class.
 */
class TenantRateLimiter {
  /**
   * @param {object} [redis] - Redis client. Optional for backward
   *   compatibility: when omitted, `this.redis` is left untouched so it can
   *   be supplied by a subclass, the prototype, or a later assignment.
   */
  constructor(redis) {
    this.limits = new Map();

    // Only assign when provided so an inherited `redis` is not shadowed.
    if (redis !== undefined) {
      this.redis = redis;
    }
  }

  /**
   * Count a request against the tenant's limit for a resource.
   * @param {string} tenantId
   * @param {string} resource
   * @param {number} [cost=1] - Integer number of units this request consumes.
   * @returns {Promise<{limit: number, remaining: number, reset: number}>}
   *   `reset` is the epoch-millisecond time at which the window ends.
   * @throws {Error} When the limit is exceeded.
   */
  async checkLimit(tenantId, resource, cost = 1) {
    const key = `${tenantId}:${resource}`;
    const limit = await this.getLimit(tenantId, resource);

    // Charge the full cost, not always 1.
    const current = await this.redis.incrby(key, cost);

    let ttl;
    if (current === cost) {
      // First request in window
      await this.redis.expire(key, limit.window);
      ttl = limit.window;
    } else {
      ttl = await this.redis.ttl(key);
      if (ttl < 0) {
        // The counter exists without an expiry (for example the expire call
        // after an earlier increment never ran). Without this the tenant
        // would stay rate-limited forever.
        await this.redis.expire(key, limit.window);
        ttl = limit.window;
      }
    }

    if (current > limit.max) {
      throw new Error(`Rate limit exceeded. Retry after ${ttl} seconds`);
    }

    return {
      limit: limit.max,
      remaining: limit.max - current,
      // Based on the key's remaining lifetime, so it is also correct for
      // requests that arrive in the middle of a window.
      reset: Date.now() + (ttl * 1000)
    };
  }

  /**
   * Resolve the limit for a tenant and resource from the tenant's plan.
   * @param {string} tenantId
   * @param {string} resource
   * @returns {Promise<{max: number, window: number}>} `window` is in seconds.
   */
  async getLimit(tenantId, resource) {
    // Tenant-specific limits
    const tenantPlan = await this.getTenantPlan(tenantId);
    const planned = tenantPlan.limits[resource];

    return {
      // Fall back only when the plan has no entry, so an explicit 0
      // ("blocked") is respected instead of silently becoming 1000.
      max: planned === undefined || planned === null ? 1000 : planned,
      window: 60 // seconds
    };
  }
}
2. Usage Tracking and Billing
/**
 * Records per-tenant usage in hourly Redis hashes (`usage:<tenantId>:<hourStartMs>`)
 * and aggregates them into reports.
 *
 * Uses `this.redis` (`hincrby`, `hgetall`) and the helpers `checkQuotas`,
 * `publishMetric`, `getHoursBetween` and `calculateCost`, none of which are
 * defined in this class.
 */
class TenantUsageTracker {
  /**
   * Add `value` to a metric in the current hour's bucket.
   * @param {string} tenantId
   * @param {string} metric - Hash field to increment.
   * @param {number} [value=1] - Integer increment.
   * @returns {Promise<void>}
   */
  async trackUsage(tenantId, metric, value = 1) {
    const timestamp = Date.now();
    // Start of the current hour in epoch milliseconds.
    const hour = Math.floor(timestamp / 3600000) * 3600000;

    // Atomic increment
    await this.redis.hincrby(
      `usage:${tenantId}:${hour}`,
      metric,
      value
    );

    // Check quotas
    await this.checkQuotas(tenantId, metric);

    // Real-time analytics
    await this.publishMetric(tenantId, metric, value);
  }

  /**
   * Sum every metric over the hourly buckets between two dates.
   * @param {string} tenantId
   * @param {*} startDate - Forwarded to `getHoursBetween` and echoed in the report.
   * @param {*} endDate - Forwarded to `getHoursBetween` and echoed in the report.
   * @returns {Promise<{tenantId: string, period: object, usage: object, cost: *}>}
   */
  async getUsageReport(tenantId, startDate, endDate) {
    const hours = this.getHoursBetween(startDate, endDate);

    // The hourly reads are independent of each other, so issue them together
    // instead of paying one round trip after another.
    const buckets = await Promise.all(
      Array.from(hours, (hour) => this.redis.hgetall(`usage:${tenantId}:${hour}`))
    );

    const usage = {};
    for (const hourUsage of buckets) {
      // Some clients return null rather than {} for a missing hash.
      for (const [metric, value] of Object.entries(hourUsage || {})) {
        const amount = Number.parseInt(value, 10);
        if (Number.isNaN(amount)) {
          // One malformed field must not turn the whole total into NaN.
          continue;
        }
        usage[metric] = (usage[metric] || 0) + amount;
      }
    }

    return {
      tenantId,
      period: { start: startDate, end: endDate },
      usage,
      cost: this.calculateCost(usage)
    };
  }
}
Compliance and Auditing
1. Audit Logging
/**
 * Tenant-scoped audit trail: records events, searches them and exports them.
 *
 * Relies on helpers that are not defined in this class: `appendLog`,
 * `indexLog`, `checkCompliance`, `search`, `getAllLogs`, `toCSV`,
 * `toSIEMFormat`, and a `generateId` function in scope.
 */
class TenantAuditLog {
  /**
   * Record one audit event.
   * @param {object} event - Source of the tenantId, userId, action, resource,
   *   result, metadata, ip and userAgent fields.
   * @returns {Promise<void>}
   */
  async log(event) {
    const auditEntry = {
      id: generateId(),
      tenantId: event.tenantId,
      timestamp: Date.now(),
      userId: event.userId,
      action: event.action,
      resource: event.resource,
      result: event.result,
      metadata: event.metadata,
      ip: event.ip,
      userAgent: event.userAgent
    };

    // Write to append-only log
    await this.appendLog(auditEntry);

    // Index for searching
    await this.indexLog(auditEntry);

    // Real-time compliance alerts
    await this.checkCompliance(auditEntry);
  }

  /**
   * Search one tenant's audit entries.
   * @param {string} tenantId - Tenant whose entries are searched.
   * @param {object} [filters] - Additional search criteria.
   * @returns {Promise<*>} Result of `this.search`.
   */
  async query(tenantId, filters) {
    // Ensure tenant isolation in queries: tenantId is written after the
    // spread so a `tenantId` inside `filters` cannot redirect the search to
    // another tenant's logs.
    const query = {
      ...filters,
      tenantId // Mandatory
    };

    return this.search(query);
  }

  /**
   * Export a tenant's audit logs for compliance.
   * @param {string} tenantId
   * @param {string} [format='json'] - One of 'csv', 'json' or 'siem'.
   * @returns {Promise<*>} The serialised logs.
   * @throws {Error} If the format is not supported.
   */
  async export(tenantId, format = 'json') {
    const logs = await this.getAllLogs(tenantId);

    switch (format) {
      case 'csv':
        return this.toCSV(logs);
      case 'json':
        return JSON.stringify(logs, null, 2);
      case 'siem':
        return this.toSIEMFormat(logs);
      default:
        // Fail loudly instead of handing back `undefined` as an "export".
        throw new Error(`Unsupported export format: ${format}`);
    }
  }
}
2. Data Residency
/**
 * Keeps a tenant's stored data inside the regions its primary region permits.
 *
 * Relies on `this.encrypt(data)` and `this.regionalStore` (a map from region
 * name to a store exposing `save`), neither of which is set up in this class.
 */
class TenantDataResidency {
  /**
   * @param {string} tenantId - Tenant that owns the data.
   * @param {string} region - The tenant's primary region.
   */
  constructor(tenantId, region) {
    this.tenantId = tenantId;
    this.region = region;
    this.allowedRegions = this.getComplianceRegions(region);
  }

  /**
   * Encrypt and store data in a permitted region.
   * @param {*} data - Value passed to `this.encrypt`.
   * @param {object} [options]
   * @param {string} [options.region] - Target region; defaults to the primary region.
   * @param {string} [options.classification='general']
   * @param {string} [options.retention='7years']
   * @returns {Promise<*>} Result of the regional store's `save`.
   * @throws {Error} If the region is not permitted or has no configured store.
   */
  async storeData(data, options = {}) {
    // Ensure data stays in compliant regions
    const storageRegion = options.region || this.region;

    if (!this.allowedRegions.includes(storageRegion)) {
      throw new Error(`Data residency violation: Cannot store in ${storageRegion}`);
    }

    // Resolve the backend before encrypting so a misconfigured region fails
    // with a clear message rather than a TypeError on `undefined.save`.
    const store = this.regionalStore ? this.regionalStore[storageRegion] : undefined;
    if (!store) {
      throw new Error(`No storage backend configured for region ${storageRegion}`);
    }

    // Encrypt before storage
    const encrypted = await this.encrypt(data);

    // Store with metadata
    return store.save({
      tenantId: this.tenantId,
      data: encrypted,
      region: storageRegion,
      classification: options.classification || 'general',
      retention: options.retention || '7years'
    });
  }

  /**
   * List the regions in which a tenant with the given primary region may
   * store data.
   * @param {string} primaryRegion
   * @returns {string[]} A new array of permitted region names.
   */
  getComplianceRegions(primaryRegion) {
    // GDPR compliance: every region in the EU group may use the whole group.
    // The membership test uses the same list that is returned, so a tenant
    // based in eu-north-1 gets the same choices as one based in eu-west-1.
    const euRegions = ['eu-west-1', 'eu-central-1', 'eu-north-1'];
    if (euRegions.includes(primaryRegion)) {
      return euRegions;
    }

    // Other compliance requirements
    return [primaryRegion];
  }
}
Customization and Configuration
1. Tenant-Specific Configurations
/**
 * Per-tenant key/value configuration with an in-memory cache.
 *
 * Relies on `this.db` (`findOne`, `upsert`), `this.publish(subject, payload)`
 * and `this.globalDefaults`, none of which are set up in this class.
 */
class TenantConfig {
  /**
   * @param {string} tenantId - Tenant whose configuration is managed.
   */
  constructor(tenantId) {
    this.tenantId = tenantId;
    // Values that exist in the database.
    this.cache = new Map();
    // Keys known to be absent from the database. Tracked separately so the
    // caller's default is never stored as if it were the configured value.
    this.missingKeys = new Set();
  }

  /**
   * Read a configuration value.
   * @param {string} key
   * @param {*} [defaultValue=null] - Returned when the key has no stored
   *   value. Applied on every call and never cached.
   * @returns {Promise<*>}
   */
  async get(key, defaultValue = null) {
    // Check cache first
    if (this.cache.has(key)) {
      return this.cache.get(key);
    }
    if (this.missingKeys.has(key)) {
      return defaultValue;
    }

    // Load from database
    const config = await this.db.findOne({
      tenantId: this.tenantId,
      key: key
    });

    if (!config) {
      // Remember the miss, not the default: a later call with a different
      // default must get its own default back.
      this.missingKeys.add(key);
      return defaultValue;
    }

    this.cache.set(key, config.value);
    return config.value;
  }

  /**
   * Persist a configuration value, refresh the cache and announce the change.
   * @param {string} key
   * @param {*} value
   * @returns {Promise<void>}
   */
  async set(key, value) {
    await this.db.upsert({
      tenantId: this.tenantId,
      key: key,
      value: value,
      updated: Date.now()
    });

    this.cache.set(key, value);
    this.missingKeys.delete(key);

    // Notify agents of config change
    await this.publish('config.changed', { key, value });
  }

  /**
   * Resolve the effective configuration for an agent type. Later layers win:
   * global defaults, then the tenant's `agents.defaults`, then the tenant's
   * `agents.<agentType>`.
   * @param {string} agentType
   * @returns {Promise<object>}
   */
  async getAgentConfig(agentType) {
    const [defaults, specific] = await Promise.all([
      this.get('agents.defaults', {}),
      this.get(`agents.${agentType}`, {})
    ]);

    // globalDefaults is optional; spreading undefined contributes nothing.
    const globalDefaults = this.globalDefaults
      ? this.globalDefaults[agentType]
      : undefined;

    return {
      ...globalDefaults,
      ...defaults,
      ...specific
    };
  }
}
2. Custom Agent Behaviors
/**
 * Deploys tenant-defined agents and tools inside restricted environments.
 *
 * Relies on members that are not defined in this class: `validateDefinition`,
 * `createSandbox`, `enforceResourceLimits`, `filterPermissions`,
 * `monitorAgent`, `db`, `safeConsole`, `createSafeAPI`, and a `VM`
 * constructor in scope.
 */
class TenantAgentCustomization {
  /**
   * Validate and deploy a tenant-supplied agent definition.
   * @param {string} tenantId
   * @param {object} agentDefinition - Definition with optional `resources`
   *   and `permissions`.
   * @returns {Promise<*>} The deployed agent returned by the sandbox.
   */
  async deployCustomAgent(tenantId, agentDefinition) {
    // Validate agent definition
    await this.validateDefinition(agentDefinition);

    // Create sandboxed environment
    const sandbox = await this.createSandbox(tenantId);

    // Deploy with restrictions. The restricted fields are written after the
    // spread so the definition cannot override them.
    const agent = await sandbox.deploy({
      ...agentDefinition,
      resources: this.enforceResourceLimits(agentDefinition.resources),
      permissions: this.filterPermissions(agentDefinition.permissions),
      namespace: `tenant.${tenantId}.custom`
    });

    // Monitor for compliance. The result is not awaited.
    // NOTE(review): if monitorAgent returns a promise, a rejection here is
    // unhandled — confirm this is intended.
    this.monitorAgent(agent);

    return agent;
  }

  /**
   * Load the tenant's active custom tools.
   * @param {string} tenantId
   * @returns {Promise<Array<{name: string, execute: Function, permissions: *}>>}
   */
  async loadCustomTools(tenantId) {
    // Tenant-specific tools/integrations
    const tools = await this.db.find({
      tenantId,
      type: 'tool',
      active: true
    });

    return tools.map(tool => ({
      name: tool.name,
      execute: this.sandboxTool(tool.code),
      permissions: tool.permissions
    }));
  }

  /**
   * Wrap tool source code in a function that runs it in a fresh VM per call.
   *
   * NOTE(review): if `VM` is vm2's VM class, that library is discontinued
   * because of sandbox escapes and should not be the only barrier around
   * untrusted tenant code — confirm which implementation is used. Also
   * confirm whether its timeout covers asynchronous code.
   *
   * @param {string} code - Tool source passed to `vm.run`.
   * @param {number} [timeout=5000] - Execution timeout in milliseconds given to the VM.
   * @returns {function(*): Promise<*>} Async function taking the tool input.
   */
  sandboxTool(code, timeout = 5000) {
    // Run custom code in isolated environment
    return async (input) => {
      const vm = new VM({
        timeout,
        sandbox: {
          input,
          console: this.safeConsole,
          // Limited API access
          api: this.createSafeAPI()
        }
      });

      return vm.run(code);
    };
  }
}
Scaling Strategies
1. Tenant-Based Sharding
/**
 * Routes tenant requests to shards and rebalances overloaded shards.
 *
 * Relies on `hashTenant(tenantId)`, `calculateLoads()` and
 * `migrateTenants(shard)`, which are not defined in this class.
 */
class TenantSharding {
  /**
   * @param {Array} shardMap - Ordered list of shards; each must expose
   *   `handle(request)` to be usable with `routeRequest`.
   * @param {number} [threshold] - Load above which `rebalance` migrates
   *   tenants off a shard. Optional for backward compatibility: when omitted,
   *   `this.threshold` is left untouched.
   */
  constructor(shardMap, threshold) {
    this.shardMap = shardMap;

    // Only assign when provided so an inherited `threshold` is not shadowed.
    if (threshold !== undefined) {
      this.threshold = threshold;
    }
  }

  /**
   * Pick the shard responsible for a tenant.
   *
   * This is modulo placement, not consistent hashing: the mapping is stable
   * only while the number of shards stays the same.
   *
   * @param {string} tenantId
   * @returns {*} The shard at the computed index.
   * @throws {Error} If no shards are configured.
   * @throws {TypeError} If `hashTenant` does not return an integer.
   */
  getShardForTenant(tenantId) {
    const shardCount = this.shardMap.length;
    if (!shardCount) {
      throw new Error('No shards configured');
    }

    const hash = this.hashTenant(tenantId);
    if (!Number.isInteger(hash)) {
      throw new TypeError('hashTenant must return an integer');
    }

    // `%` keeps the sign of the dividend in JavaScript; normalise so a
    // negative hash still maps to a valid index.
    const shardIndex = ((hash % shardCount) + shardCount) % shardCount;
    return this.shardMap[shardIndex];
  }

  /**
   * Forward a request to the tenant's shard.
   * @param {string} tenantId
   * @param {*} request
   * @returns {Promise<*>} Result of the shard's `handle`.
   */
  async routeRequest(tenantId, request) {
    const shard = this.getShardForTenant(tenantId);
    return shard.handle(request);
  }

  /**
   * Migrate tenants away from every shard whose load exceeds the threshold.
   * While `this.threshold` is undefined the comparison is always false and
   * nothing is migrated.
   * @returns {Promise<void>}
   */
  async rebalance() {
    // Move tenants between shards for load balancing
    const loadPerShard = await this.calculateLoads();

    // calculateLoads must yield [shard, load] pairs (for example a Map).
    for (const [shard, load] of loadPerShard) {
      if (load > this.threshold) {
        await this.migrateTenants(shard);
      }
    }
  }
}
2. Elastic Scaling
/**
 * Scales each tenant's agent fleet up or down based on its metrics.
 *
 * Relies on members that are not defined in this class: `getActiveTenants`,
 * `getMetrics`, `shouldScaleDown`, `scaleDown`, `getAgentCount`,
 * `deployAgent` and `notifyScaling`.
 */
class TenantAutoScaler {
  /**
   * Evaluate every active tenant once and scale where needed.
   *
   * A failure for one tenant does not stop the remaining tenants from being
   * processed. Failures are collected and reported together afterwards.
   *
   * @returns {Promise<void>}
   * @throws {Error} After all tenants were processed, if any of them failed.
   *   `error.failures` lists `{ tenantId, error }` entries and `error.cause`
   *   is the first underlying error.
   */
  async monitorAndScale() {
    const tenants = await this.getActiveTenants();
    const failures = [];

    for (const tenant of tenants) {
      try {
        const metrics = await this.getMetrics(tenant.id);

        if (this.shouldScaleUp(metrics)) {
          await this.scaleUp(tenant.id, metrics);
        } else if (this.shouldScaleDown(metrics)) {
          await this.scaleDown(tenant.id, metrics);
        }
      } catch (error) {
        // Keep the problem contained to the affected tenant.
        failures.push({ tenantId: tenant.id, error });
      }
    }

    if (failures.length > 0) {
      const summary = new Error(`Scaling failed for ${failures.length} tenant(s)`);
      summary.failures = failures;
      summary.cause = failures[0].error;
      throw summary;
    }
  }

  /**
   * Decide whether a tenant needs more capacity.
   * @param {{cpu: number, memory: number, queueDepth: number, responseTime: number}} metrics
   * @returns {boolean} True when any metric is above its threshold.
   */
  shouldScaleUp(metrics) {
    return (
      metrics.cpu > 0.8 ||
      metrics.memory > 0.8 ||
      metrics.queueDepth > 1000 ||
      metrics.responseTime > 500
    );
  }

  /**
   * Add roughly 50% more worker agents for a tenant, and at least one.
   * @param {string} tenantId
   * @param {object} metrics - Metrics that triggered the scale-up (unused here).
   * @returns {Promise<void>}
   */
  async scaleUp(tenantId, metrics) {
    // Add more agent instances
    const currentAgents = await this.getAgentCount(tenantId);
    // 50% increase, but never zero: a tenant that currently has no agents
    // would otherwise never receive one.
    const newAgents = Math.max(1, Math.ceil(currentAgents * 0.5));

    for (let i = 0; i < newAgents; i++) {
      await this.deployAgent(tenantId, {
        type: 'worker',
        autoScale: true
      });
    }

    // Notify monitoring
    await this.notifyScaling(tenantId, 'up', newAgents);
  }
}
Best Practices Summary
- Isolation First: Design with complete tenant isolation from the start
- Zero Trust: Never trust tenant-provided data without validation
- Resource Limits: Enforce hard limits to prevent noisy neighbors
- Audit Everything: Comprehensive logging for security and compliance
- Encryption Everywhere: Encrypt data both in transit and at rest
- Automated Monitoring: Detect and respond to issues automatically
- Graceful Degradation: Isolate failures to affected tenants only
- Cost Attribution: Track usage precisely for accurate billing
Building multi-tenant agent systems requires careful attention to isolation, security, and scalability. With these patterns and practices, you can create enterprise-grade platforms that serve thousands of organizations securely and efficiently.