Best Practices for Building Secure, Isolated Multi-Tenant Agent Systems

Multi-tenant architectures are essential for SaaS platforms hosting AI agents. Each tenant (organization) needs complete isolation, security, and the ability to scale independently. Here's how to build enterprise-grade multi-tenant agent systems.

The Multi-Tenancy Challenge

Building multi-tenant agent systems involves unique challenges:

Complete Isolation: One tenant's agents must never access another tenant's data
Resource Fairness: Prevent one tenant from monopolizing resources
Independent Scaling: Each tenant should scale based on their needs
Security: Provide enterprise-grade security and meet compliance requirements
Customization: Tenants need to customize agent behavior

Architecture Patterns

1. Namespace Isolation

/**
 * Scopes all messaging for one tenant under the `tenant.<tenantId>` subject prefix.
 *
 * Every subject handed to subscribe()/publish() is prefixed through
 * getSubject(), so callers only name subjects relative to their own tenant.
 */
class TenantNamespace {
  /**
   * @param {string} tenantId - Tenant identifier; becomes one token of the subject prefix.
   * @param {object} [nc] - Messaging connection exposing `subscribe(subject, handler)`
   *   and `publish(subject, payload)`. It may also be assigned to `this.nc` after construction.
   * @throws {TypeError} If tenantId is not a non-empty string, or contains whitespace, ".", "*" or ">".
   */
  constructor(tenantId, nc) {
    // The id is interpolated into the subject. On brokers that use "." as the
    // token separator and "*" / ">" as wildcards (e.g. NATS), an id containing
    // those characters could address subjects outside its own namespace.
    if (typeof tenantId !== 'string' || !/^[^\s.*>]+$/.test(tenantId)) {
      throw new TypeError('tenantId must be a non-empty string without whitespace, ".", "*" or ">"');
    }

    this.tenantId = tenantId;
    this.namespace = `tenant.${tenantId}`;

    // Only assign when provided so a connection supplied another way
    // (e.g. on the prototype or set later) is not shadowed by `undefined`.
    if (nc !== undefined) {
      this.nc = nc;
    }
  }

  /**
   * Prefixes a tenant-relative subject with this tenant's namespace.
   *
   * @param {string} subject - Subject or subscription pattern relative to the tenant.
   * @returns {string} `tenant.<tenantId>.<subject>`
   * @throws {TypeError} If subject is not a non-empty string.
   */
  getSubject(subject) {
    if (typeof subject !== 'string' || subject.length === 0) {
      throw new TypeError('subject must be a non-empty string');
    }
    return `${this.namespace}.${subject}`;
  }

  /**
   * Returns the attached connection or fails with a descriptive error.
   *
   * @returns {object} The messaging connection.
   * @throws {Error} If no connection has been attached.
   */
  requireConnection() {
    if (!this.nc) {
      throw new Error('No messaging connection attached to TenantNamespace');
    }
    return this.nc;
  }

  /**
   * Subscribes to a pattern inside this tenant's namespace.
   *
   * @param {string} pattern - Tenant-relative subscription pattern.
   * @param {Function} handler - Passed through to the connection unchanged.
   * @returns {*} Whatever the connection's `subscribe` returns.
   */
  subscribe(pattern, handler) {
    const namespacedPattern = this.getSubject(pattern);
    return this.requireConnection().subscribe(namespacedPattern, handler);
  }

  /**
   * Publishes a payload inside this tenant's namespace.
   *
   * `tenantId` and `timestamp` are written after the caller's fields, so a
   * caller-supplied `tenantId` cannot spoof another tenant.
   *
   * @param {string} subject - Tenant-relative subject.
   * @param {object} data - Payload fields to send.
   * @returns {*} Whatever the connection's `publish` returns.
   */
  publish(subject, data) {
    const namespacedSubject = this.getSubject(subject);
    return this.requireConnection().publish(namespacedSubject, {
      ...data,
      tenantId: this.tenantId,
      timestamp: Date.now()
    });
  }
}

// Usage: build a namespace for one tenant and publish a task through it.
// publish() sends through `tenant.nc`, so a messaging connection must be
// attached to the instance before this call.
// NOTE(review): `taskData` is not defined in this snippet — presumably supplied by the caller.
const tenant = new TenantNamespace('org_12345');
await tenant.publish('agents.task', taskData);
// getSubject() adds the prefix, so this actually publishes to: tenant.org_12345.agents.task

2. Resource Pool Isolation

/**
 * Tracks and enforces per-tenant resource limits (agent count, CPU, memory)
 * and creates agents bound to the tenant's namespace.
 *
 * Memory is tracked internally in bytes; limits and agent configs may give it
 * either as a number of bytes or as a string such as '100MB' (1024-based units).
 */
class TenantResourcePool {
  /**
   * @param {string} tenantId - Tenant that owns this pool.
   * @param {object} [limits] - Overrides for the default limits. Keys set to
   *   `undefined`/`null` fall back to the default instead of disabling the limit.
   * @param {number} [limits.maxAgents=100]
   * @param {number} [limits.maxCPU=10]
   * @param {number|string} [limits.maxMemory='10GB']
   * @param {number} [limits.maxMessages=100000]
   */
  constructor(tenantId, limits = {}) {
    this.tenantId = tenantId;
    this.limits = {
      maxAgents: 100,
      maxCPU: 10,
      maxMemory: '10GB',
      maxMessages: 100000
    };

    // Copy explicit overrides only. Spreading `limits` wholesale would let
    // `{ maxAgents: undefined }` erase the default, and every later comparison
    // against `undefined` is false — i.e. no limit at all.
    for (const [name, value] of Object.entries(limits || {})) {
      if (value !== undefined && value !== null) {
        this.limits[name] = value;
      }
    }

    this.usage = {
      agents: 0,
      cpu: 0,
      memory: 0,
      messages: 0
    };
  }

  /**
   * Converts a memory size to bytes.
   *
   * @param {number|string} value - Bytes as a number, or a string like '512KB', '100MB', '10GB'.
   * @returns {number} Size in bytes (units are powers of 1024).
   * @throws {TypeError} If the value cannot be parsed.
   */
  static parseMemory(value) {
    if (typeof value === 'number' && Number.isFinite(value) && value >= 0) {
      return value;
    }

    const match = /^\s*(\d+(?:\.\d+)?)\s*(B|KB|MB|GB|TB)?\s*$/i.exec(String(value));
    if (!match) {
      throw new TypeError(`Invalid memory size: ${value}`);
    }

    const units = ['B', 'KB', 'MB', 'GB', 'TB'];
    const exponent = units.indexOf((match[2] || 'B').toUpperCase());
    return Number(match[1]) * 1024 ** exponent;
  }

  /**
   * Computes the numeric resources an agent config will consume, using the
   * same defaults createIsolatedAgent() applies (0.1 CPU, '100MB').
   *
   * @param {object} [agentConfig]
   * @returns {{cpu: number, memory: number}} CPU units and memory in bytes.
   * @throws {TypeError} If cpu is not a positive finite number or memory is unparseable.
   */
  calculateResources(agentConfig = {}) {
    const config = agentConfig || {};
    const cpu = Number(config.cpu || 0.1);
    if (!Number.isFinite(cpu) || cpu <= 0) {
      throw new TypeError(`Invalid cpu request: ${config.cpu}`);
    }

    return {
      cpu,
      memory: TenantResourcePool.parseMemory(config.memory || '100MB')
    };
  }

  /**
   * Reports whether the given resources fit within the remaining CPU and memory.
   *
   * @param {{cpu: number, memory: number}} resources - As returned by calculateResources().
   * @returns {boolean}
   */
  canAllocate(resources) {
    // Small tolerance so repeated 0.1-sized additions are not rejected purely
    // because of floating-point rounding at the exact limit.
    const cpuFits = this.usage.cpu + resources.cpu <= Number(this.limits.maxCPU) + 1e-9;
    const memoryFits =
      this.usage.memory + resources.memory <= TenantResourcePool.parseMemory(this.limits.maxMemory);

    return cpuFits && memoryFits;
  }

  /**
   * Allocates resources for a new agent and returns it.
   *
   * @param {object} [agentConfig] - Agent options; `cpu` and `memory` size the allocation.
   * @returns {Promise<Agent>} The created agent.
   * @throws {Error} 'Agent limit reached for tenant' or 'Insufficient resources'.
   */
  async requestAgent(agentConfig = {}) {
    // Check limits before allocation
    if (this.usage.agents >= this.limits.maxAgents) {
      throw new Error('Agent limit reached for tenant');
    }

    const resources = this.calculateResources(agentConfig);

    if (!this.canAllocate(resources)) {
      throw new Error('Insufficient resources');
    }

    // Create first, account second: if construction throws, no usage is leaked.
    const agent = this.createIsolatedAgent(agentConfig);

    this.usage.agents++;
    this.usage.cpu += resources.cpu;
    this.usage.memory += resources.memory;

    return agent;
  }

  /**
   * Returns the resources of a previously requested agent to the pool.
   * Usage counters never drop below zero.
   *
   * @param {object} [agentConfig] - The same config that was passed to requestAgent().
   */
  releaseAgent(agentConfig = {}) {
    if (this.usage.agents === 0) {
      return;
    }

    const resources = this.calculateResources(agentConfig);
    this.usage.agents--;
    this.usage.cpu = Math.max(0, this.usage.cpu - resources.cpu);
    this.usage.memory = Math.max(0, this.usage.memory - resources.memory);
  }

  /**
   * Builds an agent pinned to this tenant. Tenant id, namespace, resources and
   * the sandbox flag are written after the caller's config so they cannot be
   * overridden by it.
   *
   * @param {object} [config] - Caller-supplied agent options.
   * @returns {Agent}
   */
  createIsolatedAgent(config = {}) {
    return new Agent({
      ...config,
      tenantId: this.tenantId,
      namespace: `tenant.${this.tenantId}`,
      resources: {
        cpu: config.cpu || 0.1,
        memory: config.memory || '100MB'
      },
      sandbox: true  // Run in isolated environment
    });
  }
}

Data Isolation Strategies

1. Database Isolation

/**
 * Tenant-scoped data access: per-tenant tables (`tenant_<id>_<table>`) plus a
 * row-level tenant filter for shared tables.
 */
class TenantDataStore {
  /**
   * @param {string} tenantId - Tenant identifier; must be safe to embed in a table name.
   * @param {object} [db] - Database client exposing `createTable(name, schema)` and
   *   `query(table, conditions)`. It may also be assigned to `this.db` after construction.
   * @throws {TypeError} If tenantId contains anything other than letters, digits or "_".
   */
  constructor(tenantId, db) {
    // tenantId is interpolated into table names, so it must be a plain identifier.
    this.assertIdentifier(tenantId, 'tenantId');

    this.tenantId = tenantId;
    this.tablePrefix = `tenant_${tenantId}_`;

    // Only assign when provided so a client attached another way is not shadowed.
    if (db !== undefined) {
      this.db = db;
    }
  }

  /**
   * Rejects values that are not plain identifiers before they reach a table name.
   *
   * @param {*} value - Candidate identifier.
   * @param {string} label - Name used in the error message.
   * @throws {TypeError} If value is not a non-empty string of letters, digits or "_".
   */
  assertIdentifier(value, label) {
    if (typeof value !== 'string' || !/^[A-Za-z0-9_]+$/.test(value)) {
      throw new TypeError(`${label} must contain only letters, digits and underscores`);
    }
  }

  /**
   * Creates this tenant's own copy of each per-tenant table, one after another.
   *
   * @returns {Promise<void>}
   */
  async createTenantSchema() {
    const tables = [
      'agents',
      'knowledge_base',
      'workflows',
      'audit_logs'
    ];

    for (const table of tables) {
      await this.db.createTable(`${this.tablePrefix}${table}`, {
        // Schema definition
      });
    }
  }

  /**
   * Queries one of this tenant's own tables.
   *
   * @param {string} table - Unprefixed table name (e.g. 'agents').
   * @param {object} [conditions] - Filter fields; `tenantId` is always overwritten.
   * @returns {Promise<*>} Whatever the database client returns.
   * @throws {TypeError} If table is not a plain identifier.
   */
  async query(table, conditions) {
    this.assertIdentifier(table, 'table');

    return this.db.query(`${this.tablePrefix}${table}`, {
      ...conditions,
      tenantId: this.tenantId  // Extra safety check
    });
  }

  /**
   * Queries a shared table with a mandatory tenant filter, excluding rows
   * whose `deletedAt` is set.
   *
   * @param {string} table - Shared table name (used without the tenant prefix).
   * @param {object} [conditions] - Filter fields; `tenantId` and `deletedAt` are always overwritten.
   * @returns {Promise<*>} Whatever the database client returns.
   * @throws {TypeError} If table is not a plain identifier.
   */
  async secureQuery(table, conditions) {
    this.assertIdentifier(table, 'table');

    return this.db.query(table, {
      ...conditions,
      tenantId: this.tenantId,  // Mandatory tenant filter
      deletedAt: null
    });
  }
}

2. Encryption at Rest

/**
 * Per-tenant authenticated encryption (AES-256-GCM) of JSON-serializable data.
 *
 * Each tenant's key is derived from a shared master secret plus the tenant id,
 * so ciphertext produced for one tenant cannot be decrypted with another
 * tenant's key.
 */
class TenantEncryption {
  /**
   * @param {string} tenantId - Tenant the key is bound to.
   * @param {string|Buffer} masterSecret - High-entropy secret of at least 32 bytes
   *   (e.g. loaded from a secrets manager). The tenant id alone is not secret and
   *   must never be the only key material.
   * @throws {TypeError} If tenantId or masterSecret is missing or too short.
   */
  constructor(tenantId, masterSecret) {
    this.tenantId = tenantId;
    // Each tenant gets unique encryption key
    this.key = this.deriveKey(tenantId, masterSecret);
  }

  /**
   * Derives the tenant's 32-byte AES key with HKDF-SHA256, using the tenant id
   * as the context (`info`) so every tenant gets an independent key.
   *
   * @param {string} tenantId
   * @param {string|Buffer} masterSecret
   * @returns {Buffer} 32-byte key.
   * @throws {TypeError} If tenantId is not a non-empty string or the secret is shorter than 32 bytes.
   */
  deriveKey(tenantId, masterSecret) {
    if (typeof tenantId !== 'string' || tenantId.length === 0) {
      throw new TypeError('tenantId must be a non-empty string');
    }

    const secret = typeof masterSecret === 'string'
      ? Buffer.from(masterSecret, 'utf8')
      : masterSecret;

    if (!Buffer.isBuffer(secret) || secret.length < 32) {
      throw new TypeError('masterSecret must be a string or Buffer of at least 32 bytes');
    }

    const derived = crypto.hkdfSync(
      'sha256',
      secret,
      Buffer.alloc(0),
      `tenant-encryption:${tenantId}`,
      32
    );

    // hkdfSync returns an ArrayBuffer; the cipher APIs expect a Buffer/TypedArray.
    return Buffer.from(derived);
  }

  /**
   * Encrypts a JSON-serializable value.
   *
   * @param {*} data - Value to serialize with JSON.stringify and encrypt.
   * @returns {Promise<{data: string, iv: string, tag: string}>} Base64 ciphertext, IV and auth tag.
   * @throws {TypeError} If the value cannot be serialized to JSON (e.g. `undefined`).
   */
  async encrypt(data) {
    const plaintext = JSON.stringify(data);
    if (plaintext === undefined) {
      throw new TypeError('data is not JSON-serializable');
    }

    // 96-bit IV: the size GCM is specified for; a fresh random IV per message.
    const iv = crypto.randomBytes(12);
    const cipher = crypto.createCipheriv('aes-256-gcm', this.key, iv);

    const encrypted = Buffer.concat([
      cipher.update(plaintext, 'utf8'),
      cipher.final()
    ]);

    return {
      data: encrypted.toString('base64'),
      iv: iv.toString('base64'),
      tag: cipher.getAuthTag().toString('base64')
    };
  }

  /**
   * Decrypts a value produced by encrypt(). Any IV length is accepted so
   * records written with the earlier 16-byte IV remain readable.
   *
   * @param {{data: string, iv: string, tag: string}} encryptedData
   * @returns {Promise<*>} The original value.
   * @throws {Error} If the tag is not 16 bytes or authentication fails
   *   (wrong key, wrong tenant, or tampered data).
   */
  async decrypt(encryptedData) {
    const tag = Buffer.from(encryptedData.tag, 'base64');
    // Refuse truncated tags; a short tag makes forgery drastically easier.
    if (tag.length !== 16) {
      throw new Error('Invalid authentication tag length');
    }

    const decipher = crypto.createDecipheriv(
      'aes-256-gcm',
      this.key,
      Buffer.from(encryptedData.iv, 'base64'),
      { authTagLength: 16 }
    );

    decipher.setAuthTag(tag);

    const decrypted = Buffer.concat([
      decipher.update(Buffer.from(encryptedData.data, 'base64')),
      decipher.final()
    ]);

    return JSON.parse(decrypted.toString('utf8'));
  }
}

Authentication and Authorization

1. JWT-Based Tenant Isolation

/**
 * Issues and validates tenant-bound JWTs.
 *
 * Relies on members not defined in this class: `getSecret(tenantId)`,
 * `publicKey`, `extractToken(req)` and `hasPermission(permissions, action)`.
 * NOTE(review): tokens are signed with `getSecret(tenantId)` but verified with
 * the single `this.publicKey` — confirm these belong to the same RS256 key pair.
 */
class TenantAuth {
  /**
   * Signs a 24-hour token for a user within one tenant.
   *
   * @param {string} userId - Stored in `sub`.
   * @param {string} tenantId - Stored in `tenant` and used as the token audience.
   * @param {Array} permissions - Stored in `permissions`.
   * @returns {Promise<string>} The value returned by `jwt.sign`.
   */
  async generateToken(userId, tenantId, permissions) {
    // JWT NumericDate values (iat/exp) are SECONDS since the epoch. Using
    // Date.now() milliseconds here would push `exp` tens of thousands of years
    // into the future, i.e. the token would effectively never expire.
    // NOTE(review): tokens minted with millisecond timestamps remain valid;
    // consider rotating the signing key to invalidate them.
    const issuedAt = Math.floor(Date.now() / 1000);

    const payload = {
      sub: userId,
      tenant: tenantId,
      permissions: permissions,
      iat: issuedAt,
      exp: issuedAt + (24 * 60 * 60) // 24 hours
    };

    return jwt.sign(payload, this.getSecret(tenantId), {
      algorithm: 'RS256',
      issuer: 'artcafe.ai',
      audience: tenantId
    });
  }

  /**
   * Validates the token on a request against the tenant and action the
   * request targets.
   *
   * @param {object} req - Request carrying the token plus `tenantId` and `action`.
   * @returns {Promise<object>} The decoded token payload.
   * @throws {Error} 'Authentication failed' for any verification, tenant or
   *   permission failure; the underlying error is attached as `cause`.
   */
  async validateRequest(req) {
    const token = this.extractToken(req);

    try {
      // Without a tenant on the request, a token lacking a `tenant` claim
      // would compare equal (undefined === undefined) and slip through.
      if (!req || !req.tenantId) {
        throw new Error('Missing tenant context');
      }

      const decoded = jwt.verify(token, this.publicKey, {
        algorithms: ['RS256'],
        issuer: 'artcafe.ai',
        audience: req.tenantId // tokens are issued with audience = tenantId
      });

      // Verify tenant access
      if (decoded.tenant !== req.tenantId) {
        throw new Error('Tenant mismatch');
      }

      // Check permissions
      if (!this.hasPermission(decoded.permissions, req.action)) {
        throw new Error('Insufficient permissions');
      }

      return decoded;
    } catch (error) {
      // Callers see one generic message; the specific reason is kept on
      // `cause` for server-side logging.
      const failure = new Error('Authentication failed');
      failure.cause = error;
      throw failure;
    }
  }
}

2. API Key Management

/**
 * Creates and validates tenant-scoped API keys of the form `<tenantId>_<secret>`.
 * Only a bcrypt hash of the secret part is stored.
 *
 * Relies on members not defined in this class: `generateSecureKey()`,
 * `store` (save/find/update), and the free names `bcrypt` and `generateId`.
 */
class TenantAPIKeys {
  /**
   * Creates a key and stores its hash.
   *
   * @param {string} tenantId
   * @param {string} name - Human-readable label for the key.
   * @param {*} permissions - Stored alongside the key and returned on validation.
   * @returns {Promise<{key: string, name: string, permissions: *}>} The full key
   *   (`<tenantId>_<secret>`); it is not recoverable later.
   * @throws {Error} If the generated secret contains "_", which would make the
   *   full key impossible to split back into tenant id and secret.
   */
  async createAPIKey(tenantId, name, permissions) {
    const key = this.generateSecureKey();

    // validateAPIKey() splits on the LAST underscore, so the secret part must
    // not contain one (hex or base62 output satisfies this).
    if (typeof key !== 'string' || key.length === 0 || key.includes('_')) {
      throw new Error('generateSecureKey() must return a non-empty string without "_"');
    }

    const hashedKey = await bcrypt.hash(key, 10);

    await this.store.save({
      id: generateId(),
      tenantId,
      name,
      keyHash: hashedKey,
      permissions,
      created: Date.now(),
      lastUsed: null,
      active: true
    });

    // Return key only once
    return {
      key: `${tenantId}_${key}`,
      name,
      permissions
    };
  }

  /**
   * Validates a full API key and records when it was last used.
   *
   * @param {string} apiKey - `<tenantId>_<secret>`.
   * @returns {Promise<{valid: boolean, tenantId?: string, permissions?: *}>}
   *   `{ valid: false }` for malformed or unknown keys.
   */
  async validateAPIKey(apiKey) {
    if (typeof apiKey !== 'string') {
      return { valid: false };
    }

    // Split on the last underscore: tenant ids such as "org_12345" contain
    // underscores themselves, so `split('_')` would cut the tenant id apart
    // and no key for such a tenant could ever validate.
    const separator = apiKey.lastIndexOf('_');
    if (separator <= 0 || separator === apiKey.length - 1) {
      return { valid: false };
    }

    const tenantId = apiKey.slice(0, separator);
    const key = apiKey.slice(separator + 1);

    const storedKeys = await this.store.find({
      tenantId,
      active: true
    });

    for (const stored of storedKeys) {
      if (await bcrypt.compare(key, stored.keyHash)) {
        // Update last used
        await this.store.update(stored.id, {
          lastUsed: Date.now()
        });

        return {
          valid: true,
          tenantId,
          permissions: stored.permissions
        };
      }
    }

    return { valid: false };
  }
}

Resource Management and Quotas

1. Rate Limiting per Tenant

/**
 * Fixed-window rate limiter keyed by tenant and resource, backed by Redis
 * counters.
 *
 * Relies on members not defined in this class: `redis`
 * (incr/incrby/expire/ttl) and `getTenantPlan(tenantId)`.
 */
class TenantRateLimiter {
  constructor() {
    this.limits = new Map();
  }

  /**
   * Charges `cost` units against the tenant's window for a resource.
   *
   * The counter is incremented before the limit is checked, so rejected
   * requests still count toward the window.
   *
   * @param {string} tenantId
   * @param {string} resource
   * @param {number} [cost=1] - Positive integer number of units to charge.
   * @returns {Promise<{limit: number, remaining: number, reset: number}>}
   *   `reset` is the epoch-millisecond time at which the current window expires.
   * @throws {RangeError} If cost is not a positive integer.
   * @throws {Error} 'Rate limit exceeded. Retry after N seconds' when over the limit.
   */
  async checkLimit(tenantId, resource, cost = 1) {
    if (!Number.isInteger(cost) || cost < 1) {
      throw new RangeError('cost must be a positive integer');
    }

    const key = `${tenantId}:${resource}`;
    const limit = await this.getLimit(tenantId, resource);

    // Charge the requested cost rather than always 1.
    const current = cost === 1
      ? await this.redis.incr(key)
      : await this.redis.incrby(key, cost);

    // A negative TTL means the key has no expiry yet: either this call created
    // it, or an earlier EXPIRE was lost. Setting it here in both cases keeps a
    // counter from living forever and permanently blocking the tenant.
    let ttl = await this.redis.ttl(key);
    if (ttl < 0) {
      await this.redis.expire(key, limit.window);
      ttl = limit.window;
    }

    if (current > limit.max) {
      throw new Error(`Rate limit exceeded. Retry after ${ttl} seconds`);
    }

    return {
      limit: limit.max,
      remaining: limit.max - current,
      // Based on the window's remaining lifetime, not a full window from now.
      reset: Date.now() + (ttl * 1000)
    };
  }

  /**
   * Looks up the limit for a resource from the tenant's plan.
   *
   * @param {string} tenantId
   * @param {string} resource
   * @returns {Promise<{max: number, window: number}>} `max` falls back to 1000
   *   when the plan has no (or a falsy) entry; `window` is 60 seconds.
   */
  async getLimit(tenantId, resource) {
    // Tenant-specific limits
    const tenantPlan = await this.getTenantPlan(tenantId);
    const planLimits = (tenantPlan && tenantPlan.limits) || {};

    return {
      max: planLimits[resource] || 1000,
      window: 60 // seconds
    };
  }
}

2. Usage Tracking and Billing

/**
 * Records per-tenant usage in hourly Redis hashes (`usage:<tenantId>:<hourStartMs>`)
 * and aggregates them into reports.
 *
 * Relies on members not defined in this class: `redis` (hincrby/hgetall),
 * `checkQuotas`, `publishMetric`, `getHoursBetween` and `calculateCost`.
 */
class TenantUsageTracker {
  /**
   * Adds `value` to a metric in the current hour's bucket, then runs the quota
   * check and publishes the metric.
   *
   * @param {string} tenantId
   * @param {string} metric - Hash field to increment.
   * @param {number} [value=1] - Amount to add.
   * @returns {Promise<void>}
   */
  async trackUsage(tenantId, metric, value = 1) {
    const timestamp = Date.now();
    // Truncate to the start of the hour (epoch milliseconds).
    const hour = Math.floor(timestamp / 3600000) * 3600000;

    // Atomic increment
    await this.redis.hincrby(
      `usage:${tenantId}:${hour}`,
      metric,
      value
    );

    // Check quotas
    await this.checkQuotas(tenantId, metric);

    // Real-time analytics
    await this.publishMetric(tenantId, metric, value);
  }

  /**
   * Sums every metric across the hourly buckets between two dates.
   *
   * @param {string} tenantId
   * @param {*} startDate - Passed to `getHoursBetween` and echoed in the report.
   * @param {*} endDate - Passed to `getHoursBetween` and echoed in the report.
   * @returns {Promise<{tenantId: string, period: {start: *, end: *}, usage: object, cost: *}>}
   */
  async getUsageReport(tenantId, startDate, endDate) {
    const hours = this.getHoursBetween(startDate, endDate);
    const usage = {};

    for (const hour of hours) {
      const hourUsage = await this.redis.hgetall(
        `usage:${tenantId}:${hour}`
      );

      // Some clients return null for a missing hash; treat it as "no usage".
      for (const [metric, value] of Object.entries(hourUsage || {})) {
        const amount = Number.parseInt(value, 10);
        // Skip unparseable fields: one bad value must not turn the total into NaN.
        if (Number.isNaN(amount)) {
          continue;
        }
        usage[metric] = (usage[metric] || 0) + amount;
      }
    }

    return {
      tenantId,
      period: { start: startDate, end: endDate },
      usage,
      cost: this.calculateCost(usage)
    };
  }
}

Compliance and Auditing

1. Audit Logging

/**
 * Writes, searches and exports per-tenant audit entries.
 *
 * Relies on members not defined in this class: `appendLog`, `indexLog`,
 * `checkCompliance`, `search`, `getAllLogs`, `toCSV`, `toSIEMFormat`, and the
 * free name `generateId`.
 */
class TenantAuditLog {
  /**
   * Builds an audit entry from an event and hands it, in order, to the
   * append-only log, the search index and the compliance check.
   *
   * @param {object} event - Source of tenantId, userId, action, resource,
   *   result, metadata, ip and userAgent.
   * @returns {Promise<void>}
   */
  async log(event) {
    const auditEntry = {
      id: generateId(),
      tenantId: event.tenantId,
      timestamp: Date.now(),
      userId: event.userId,
      action: event.action,
      resource: event.resource,
      result: event.result,
      metadata: event.metadata,
      ip: event.ip,
      userAgent: event.userAgent
    };

    // Write to append-only log
    await this.appendLog(auditEntry);

    // Index for searching
    await this.indexLog(auditEntry);

    // Real-time compliance alerts
    await this.checkCompliance(auditEntry);
  }

  /**
   * Searches one tenant's audit entries.
   *
   * @param {string} tenantId - Tenant whose entries may be returned.
   * @param {object} [filters] - Additional search criteria; any `tenantId`
   *   inside it is ignored.
   * @returns {Promise<*>} Whatever `search` returns.
   */
  async query(tenantId, filters) {
    // Ensure tenant isolation in queries: tenantId is written LAST so a
    // caller-supplied `filters.tenantId` cannot redirect the search to
    // another tenant's logs.
    const query = {
      ...filters,
      tenantId  // Mandatory
    };

    return this.search(query);
  }

  /**
   * Exports all of a tenant's audit logs for compliance.
   *
   * @param {string} tenantId
   * @param {'json'|'csv'|'siem'} [format='json']
   * @returns {Promise<*>} Pretty-printed JSON text, or the output of
   *   `toCSV` / `toSIEMFormat`.
   * @throws {Error} If the format is not supported.
   */
  async export(tenantId, format = 'json') {
    // Reject unknown formats up front instead of silently returning undefined
    // after loading every log.
    if (!['csv', 'json', 'siem'].includes(format)) {
      throw new Error(`Unsupported export format: ${format}`);
    }

    // Export tenant's audit logs for compliance
    const logs = await this.getAllLogs(tenantId);

    switch (format) {
      case 'csv':
        return this.toCSV(logs);
      case 'siem':
        return this.toSIEMFormat(logs);
      default:
        return JSON.stringify(logs, null, 2);
    }
  }
}

2. Data Residency

/**
 * Restricts where a tenant's data may be stored, based on the tenant's
 * primary region.
 *
 * Relies on members not defined in this class: `encrypt(data)` and
 * `regionalStore` (a map from region name to a store with `save`).
 */
class TenantDataResidency {
  /**
   * @param {string} tenantId
   * @param {string} region - The tenant's primary region; also the default storage region.
   */
  constructor(tenantId, region) {
    this.tenantId = tenantId;
    this.region = region;
    this.allowedRegions = this.getComplianceRegions(region);
  }

  /**
   * Encrypts data and saves it in an allowed region.
   *
   * @param {*} data - Passed to `encrypt`.
   * @param {object} [options]
   * @param {string} [options.region] - Target region; defaults to the primary region.
   * @param {string} [options.classification='general']
   * @param {string} [options.retention='7years']
   * @returns {Promise<*>} Whatever the regional store's `save` returns.
   * @throws {Error} If the region is not allowed for this tenant, or no store
   *   is configured for it.
   */
  async storeData(data, options = {}) {
    // Ensure data stays in compliant regions
    const storageRegion = options.region || this.region;

    if (!this.allowedRegions.includes(storageRegion)) {
      throw new Error(`Data residency violation: Cannot store in ${storageRegion}`);
    }

    // Resolve the store before encrypting so a misconfigured region fails
    // with a clear message rather than a TypeError on `undefined.save`.
    const store = this.regionalStore && this.regionalStore[storageRegion];
    if (!store) {
      throw new Error(`No regional store configured for ${storageRegion}`);
    }

    // Encrypt before storage
    const encrypted = await this.encrypt(data);

    // Store with metadata
    return store.save({
      tenantId: this.tenantId,
      data: encrypted,
      region: storageRegion,
      classification: options.classification || 'general',
      retention: options.retention || '7years'
    });
  }

  /**
   * Lists the regions a tenant with the given primary region may store data in.
   *
   * @param {string} primaryRegion
   * @returns {string[]} All EU regions when the primary region is one of them;
   *   otherwise only the primary region itself.
   */
  getComplianceRegions(primaryRegion) {
    // GDPR compliance: one list serves as both the membership test and the
    // result, so every region offered as an EU target (including eu-north-1)
    // is also recognised as an EU primary region.
    const euRegions = ['eu-west-1', 'eu-central-1', 'eu-north-1'];
    if (euRegions.includes(primaryRegion)) {
      return [...euRegions];
    }

    // Other compliance requirements
    return [primaryRegion];
  }
}

Customization and Configuration

1. Tenant-Specific Configurations

/**
 * Per-tenant key/value configuration with an in-memory read cache.
 *
 * Relies on members not defined in this class: `db` (findOne/upsert),
 * `publish(subject, payload)` and `globalDefaults`.
 */
class TenantConfig {
  /**
   * @param {string} tenantId
   */
  constructor(tenantId) {
    this.tenantId = tenantId;
    this.cache = new Map();
    // Keys known to be absent from the database. Kept apart from `cache` so a
    // miss never stores one caller's default as if it were the real value.
    this.missing = new Set();
  }

  /**
   * Reads a config value, consulting the cache before the database.
   *
   * @param {string} key
   * @param {*} [defaultValue=null] - Returned when the key has no stored value.
   *   It is never cached, so each caller gets the default it passed.
   * @returns {Promise<*>}
   */
  async get(key, defaultValue = null) {
    // Check cache first
    if (this.cache.has(key)) {
      return this.cache.get(key);
    }

    if (this.missing.has(key)) {
      return defaultValue;
    }

    // Load from database
    const config = await this.db.findOne({
      tenantId: this.tenantId,
      key: key
    });

    if (!config) {
      // Remember the miss (one lookup per key) without caching the default.
      this.missing.add(key);
      return defaultValue;
    }

    this.cache.set(key, config.value);

    return config.value;
  }

  /**
   * Persists a config value, updates the cache and announces the change.
   *
   * @param {string} key
   * @param {*} value
   * @returns {Promise<void>}
   */
  async set(key, value) {
    await this.db.upsert({
      tenantId: this.tenantId,
      key: key,
      value: value,
      updated: Date.now()
    });

    this.cache.set(key, value);
    this.missing.delete(key);

    // Notify agents of config change
    await this.publish('config.changed', { key, value });
  }

  /**
   * Resolves the effective config for an agent type. Later sources win:
   * global defaults for the type, then the tenant's `agents.defaults`, then
   * the tenant's `agents.<agentType>`.
   *
   * @param {string} agentType
   * @returns {Promise<object>}
   */
  async getAgentConfig(agentType) {
    // Hierarchical config with overrides
    const defaults = await this.get('agents.defaults', {});
    const specific = await this.get(`agents.${agentType}`, {});
    // Tolerate an instance without global defaults rather than throwing.
    const globals = (this.globalDefaults && this.globalDefaults[agentType]) || {};

    return {
      ...globals,
      ...defaults,
      ...specific
    };
  }
}

2. Custom Agent Behaviors

/**
 * Deploys tenant-defined agents and tools under restrictions.
 *
 * Relies on members not defined in this class: `validateDefinition`,
 * `createSandbox`, `enforceResourceLimits`, `filterPermissions`,
 * `monitorAgent`, `db`, `safeConsole`, `createSafeAPI`, and the free name `VM`.
 */
class TenantAgentCustomization {
  /**
   * Validates an agent definition and deploys it into a tenant sandbox.
   * Resources, permissions and namespace are overwritten after the
   * definition's own fields are copied.
   *
   * @param {string} tenantId
   * @param {object} agentDefinition
   * @returns {Promise<*>} The agent returned by the sandbox's `deploy`.
   */
  async deployCustomAgent(tenantId, agentDefinition) {
    await this.validateDefinition(agentDefinition);

    const sandbox = await this.createSandbox(tenantId);

    // Start from the tenant's definition, then replace the fields the
    // platform controls.
    const restricted = { ...agentDefinition };
    restricted.resources = this.enforceResourceLimits(agentDefinition.resources);
    restricted.permissions = this.filterPermissions(agentDefinition.permissions);
    restricted.namespace = `tenant.${tenantId}.custom`;

    const agent = await sandbox.deploy(restricted);

    // Monitoring is started without being awaited.
    this.monitorAgent(agent);

    return agent;
  }

  /**
   * Loads the tenant's active tools and wraps each tool's code in a sandboxed
   * executor.
   *
   * @param {string} tenantId
   * @returns {Promise<Array<{name: *, execute: Function, permissions: *}>>}
   */
  async loadCustomTools(tenantId) {
    const criteria = {
      tenantId,
      type: 'tool',
      active: true
    };
    const tools = await this.db.find(criteria);

    return tools.map(({ name, code, permissions }) => ({
      name,
      execute: this.sandboxTool(code),
      permissions
    }));
  }

  /**
   * Wraps tool code in a function that runs it in a fresh VM per call, with a
   * 5000 ms timeout and only `input`, `console` and `api` exposed.
   *
   * NOTE(review): the `VM` constructor options and `.run(code)` look like the
   * vm2 package — if so, confirm it is still considered safe for untrusted
   * tenant code before relying on it as the isolation boundary.
   *
   * @param {string} code - Tool source to execute.
   * @returns {Function} Async `(input) => result` executor.
   */
  sandboxTool(code) {
    return async (input) => {
      const exposed = {
        input,
        console: this.safeConsole,
        // Limited API access
        api: this.createSafeAPI()
      };
      const options = { timeout: 5000, sandbox: exposed };

      const vm = new VM(options);
      return vm.run(code);
    };
  }
}

Scaling Strategies

1. Tenant-Based Sharding

/**
 * Maps tenants to shards and routes their requests.
 *
 * Relies on members not defined in this class for rebalancing:
 * `calculateLoads`, `threshold` and `migrateTenants`.
 */
class TenantSharding {
  /**
   * @param {Array} shardMap - Ordered list of shards; each must expose `handle(request)`.
   */
  constructor(shardMap) {
    this.shardMap = shardMap;
  }

  /**
   * Hashes a tenant id to an unsigned 32-bit integer (FNV-1a over the id's
   * UTF-16 code units). Deterministic across processes.
   *
   * @param {string} tenantId
   * @returns {number} Integer in [0, 2^32).
   */
  hashTenant(tenantId) {
    const text = String(tenantId);
    let hash = 0x811c9dc5;

    for (let i = 0; i < text.length; i++) {
      hash ^= text.charCodeAt(i);
      hash = Math.imul(hash, 0x01000193);
    }

    return hash >>> 0;
  }

  /**
   * Picks the shard for a tenant.
   *
   * This is plain hash-modulo placement, not consistent hashing: changing the
   * number of shards remaps most tenants, so resizing requires a migration.
   *
   * @param {string} tenantId
   * @returns {*} The shard at the computed index.
   * @throws {Error} If no shards are configured.
   * @throws {TypeError} If `hashTenant` does not return an integer.
   */
  getShardForTenant(tenantId) {
    const shardCount = this.shardMap ? this.shardMap.length : 0;
    if (!shardCount) {
      throw new Error('No shards configured');
    }

    const hash = this.hashTenant(tenantId);
    if (!Number.isInteger(hash)) {
      throw new TypeError('hashTenant must return an integer');
    }

    // `%` keeps the sign of the dividend in JavaScript; normalise so a
    // negative hash still yields a valid index instead of `undefined`.
    const shardIndex = ((hash % shardCount) + shardCount) % shardCount;
    return this.shardMap[shardIndex];
  }

  /**
   * Forwards a request to the tenant's shard.
   *
   * @param {string} tenantId
   * @param {*} request
   * @returns {Promise<*>} Whatever the shard's `handle` returns.
   */
  async routeRequest(tenantId, request) {
    const shard = this.getShardForTenant(tenantId);
    return shard.handle(request);
  }

  /**
   * Migrates tenants off every shard whose load exceeds `this.threshold`,
   * one shard at a time.
   *
   * @returns {Promise<void>}
   */
  async rebalance() {
    // Move tenants between shards for load balancing
    const loadPerShard = await this.calculateLoads();

    for (const [shard, load] of loadPerShard) {
      if (load > this.threshold) {
        await this.migrateTenants(shard);
      }
    }
  }
}

2. Elastic Scaling

/**
 * Scales each tenant's agent fleet up or down based on its metrics.
 *
 * Relies on members not defined in this class: `getActiveTenants`,
 * `getMetrics`, `shouldScaleDown`, `scaleDown`, `getAgentCount`,
 * `deployAgent` and `notifyScaling`.
 */
class TenantAutoScaler {
  /**
   * Evaluates every active tenant in turn and scales it up or down as needed.
   * Scale-up takes precedence when both conditions hold.
   *
   * @returns {Promise<void>}
   */
  async monitorAndScale() {
    const tenants = await this.getActiveTenants();

    for (const tenant of tenants) {
      const metrics = await this.getMetrics(tenant.id);

      if (this.shouldScaleUp(metrics)) {
        await this.scaleUp(tenant.id, metrics);
      } else if (this.shouldScaleDown(metrics)) {
        await this.scaleDown(tenant.id, metrics);
      }
    }
  }

  /**
   * Reports whether any metric crosses its scale-up threshold.
   *
   * @param {{cpu: number, memory: number, queueDepth: number, responseTime: number}} metrics
   * @returns {boolean} True when cpu or memory exceeds 0.8, queueDepth exceeds
   *   1000, or responseTime exceeds 500.
   */
  shouldScaleUp(metrics) {
    return (
      metrics.cpu > 0.8 ||
      metrics.memory > 0.8 ||
      metrics.queueDepth > 1000 ||
      metrics.responseTime > 500
    );
  }

  /**
   * Deploys additional worker agents for a tenant: 50% of the current count,
   * rounded up, and never fewer than one.
   *
   * @param {string} tenantId
   * @param {object} metrics - Not used by this method.
   * @returns {Promise<void>}
   */
  async scaleUp(tenantId, metrics) {
    // Add more agent instances
    const currentAgents = await this.getAgentCount(tenantId);
    // 50% increase. The floor of 1 matters when the tenant has no agents:
    // 50% of zero is zero, which would leave it unable to ever scale up.
    const newAgents = Math.max(1, Math.ceil(currentAgents * 0.5));

    for (let i = 0; i < newAgents; i++) {
      await this.deployAgent(tenantId, {
        type: 'worker',
        autoScale: true
      });
    }

    // Notify monitoring
    await this.notifyScaling(tenantId, 'up', newAgents);
  }
}

Best Practices Summary

Isolation First: Design with complete tenant isolation from the start
Zero Trust: Never trust tenant-provided data without validation
Resource Limits: Enforce hard limits to prevent noisy neighbors
Audit Everything: Comprehensive logging for security and compliance
Encryption Everywhere: Encrypt in transit and at rest
Automated Monitoring: Detect and respond to issues automatically
Graceful Degradation: Isolate failures to affected tenants only
Cost Attribution: Track usage precisely for accurate billing

Building multi-tenant agent systems requires careful attention to isolation, security, and scalability. With these patterns and practices, you can create enterprise-grade platforms that serve thousands of organizations securely and efficiently.

Building Multi-Tenant AI Agent Systems