data-validation

TotalClaw 作者 totalclaw

基于模式(Schema)进行跨语言、跨格式的数据验证。适用于:定义 JSON Schema、使用 Zod(TypeScript)或 Pydantic(Python)、验证 API 请求/响应结构、检查 CSV/JSON 数据完整性,或在服务之间建立数据契约。

安装 / 下载方式

**TotalClaw CLI(推荐)**

```bash
totalclaw install totalclaw:totalclaw~gitgoodordietrying-data-validation
```

**cURL(直接下载,无需登录)**

```bash
curl -fsSL https://skills.taituai.com/api/skills/totalclaw%3Atotalclaw~gitgoodordietrying-data-validation/file -o gitgoodordietrying-data-validation.md
```

## 概述(中文)

基于模式(Schema)进行跨语言、跨格式的数据验证。适用于:定义 JSON Schema、使用 Zod(TypeScript)或 Pydantic(Python)、验证 API 请求/响应结构、检查 CSV/JSON 数据完整性,或在服务之间建立数据契约。

## 原文

# Data Validation

Schema-based data validation across languages and formats. Covers JSON Schema, Zod (TypeScript), Pydantic (Python), API boundary validation, data contracts, and integrity checking.

## When to Use

- Defining the shape of API request/response bodies
- Validating user input before processing
- Setting up data contracts between services
- Checking CSV/JSON file integrity before import
- Migrating data (did the ETL preserve everything?)
- Generating types or documentation from schemas

## JSON Schema

### Basic schema

```json
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "type": "object",
  "required": ["name", "email", "age"],
  "properties": {
    "name": {
      "type": "string",
      "minLength": 1,
      "maxLength": 100
    },
    "email": {
      "type": "string",
      "format": "email"
    },
    "age": {
      "type": "integer",
      "minimum": 0,
      "maximum": 150
    },
    "role": {
      "type": "string",
      "enum": ["user", "admin", "moderator"],
      "default": "user"
    },
    "tags": {
      "type": "array",
      "items": { "type": "string" },
      "uniqueItems": true,
      "maxItems": 10
    },
    "address": {
      "type": "object",
      "properties": {
        "street": { "type": "string" },
        "city": { "type": "string" },
        "zip": { "type": "string", "pattern": "^\\d{5}(-\\d{4})?$" }
      },
      "required": ["street", "city"]
    }
  },
  "additionalProperties": false
}
```

### Common patterns

```jsonc
// Nullable field
{ "type": ["string", "null"] }

// Union type (string or number)
{ "oneOf": [{ "type": "string" }, { "type": "number" }] }

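// Conditional: if role is admin, require permissions
// ("required" inside "if" stops the condition from matching when "role" is absent)
{
  "if": {
    "properties": { "role": { "const": "admin" } },
    "required": ["role"]
  },
  "then": { "required": ["permissions"] }
}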

// Pattern properties (dynamic keys)
{
  "type": "object",
  "patternProperties": {
    "^env_": { "type": "string" }
  }
}

// Reusable definitions
{
  "$defs": {
    "address": {
      "type": "object",
      "properties": {
        "street": { "type": "string" },
        "city": { "type": "string" }
      }
    }
  },
  "properties": {
    "home": { "$ref": "#/$defs/address" },
    "work": { "$ref": "#/$defs/address" }
  }
}
```

### Validate with command line

```bash
# Using ajv-cli (Node.js)
# --spec=draft2020 selects the dialect named by the schema's "$schema"
# (ajv-cli assumes draft-07 otherwise); -c ajv-formats registers "format"
# keywords such as "email", which are not built in.
npx -p ajv-cli -p ajv-formats ajv validate --spec=draft2020 -c ajv-formats \
  -s schema.json -d data.json

# Using jsonschema (Python)
pip install jsonschema
python3 -c "
import json, jsonschema
with open('schema.json') as f:
    schema = json.load(f)
with open('data.json') as f:
    data = json.load(f)
jsonschema.validate(data, schema)  # raises ValidationError on invalid data
print('Valid')
"

# Validate multiple files
for f in data/*.json; do
  [ -e "$f" ] || continue  # glob matched nothing: skip the literal pattern
  npx -p ajv-cli -p ajv-formats ajv validate --spec=draft2020 -c ajv-formats \
    -s schema.json -d "$f" 2>&1 || echo "INVALID: $f"
done
```

## Zod (TypeScript)

### Basic schemas

```typescript
import { z } from 'zod';

// Standalone validators for single values
const nameSchema = z
  .string()
  .min(1)
  .max(100);
const ageSchema = z
  .number()
  .int()
  .min(0)
  .max(150);
const emailSchema = z.string().email();
const urlSchema = z.string().url();

// Object schema: every key maps to the validator for that field
const userSchema = z.object({
  name: z.string().min(1),
  email: z.string().email(),
  age: z.number().int().min(0),
  // Fields with .default() may be omitted from the input
  role: z.enum(['user', 'admin', 'moderator']).default('user'),
  tags: z.array(z.string()).max(10).default([]),
  createdAt: z.string().datetime(),
});

// The static type is derived from the schema, so the two cannot drift apart
type User = z.infer<typeof userSchema>;
// { name: string; email: string; age: number; role: "user" | "admin" | "moderator"; ... }

// Non-throwing validation: inspect the discriminated result
const result = userSchema.safeParse(data);
if (!result.success) {
  console.log(result.error.issues); // validation errors
} else {
  console.log(result.data); // typed as User
}

// Throwing validation: returns the parsed value or raises on invalid input
const user = userSchema.parse(data);
```

### Advanced patterns

```typescript
// Optional and nullable
const schema = z.object({
  name: z.string(),
  nickname: z.string().optional(),       // string | undefined
  middleName: z.string().nullable(),     // string | null
  suffix: z.string().nullish(),          // string | null | undefined
});

// Transforms (validate then transform)
const dateSchema = z.string().datetime().transform(s => new Date(s));
const trimmed = z.string().trim().toLowerCase();
// parseInt yields NaN for non-numeric text; the piped z.number() rejects it
const parsed = z.string().transform(s => parseInt(s, 10)).pipe(z.number().int());

// Discriminated unions (tagged unions)
const eventSchema = z.discriminatedUnion('type', [
  z.object({ type: z.literal('click'), x: z.number(), y: z.number() }),
  z.object({ type: z.literal('keypress'), key: z.string() }),
  z.object({ type: z.literal('scroll'), delta: z.number() }),
]);

// Recursive types
// TypeScript cannot infer a self-referential schema, so the types are written
// by hand. Because `children` has a default, the accepted input (children
// optional) differs from the parsed output (children always present); both
// are passed to ZodType as <Output, Def, Input>.
interface Category {
  name: string;
  children: Category[];
}
interface CategoryInput {
  name: string;
  children?: CategoryInput[];
}
const categorySchema: z.ZodType<Category, z.ZodTypeDef, CategoryInput> = z.object({
  name: z.string(),
  children: z.lazy(() => z.array(categorySchema)).default([]),
});

// Refinements (custom validation)
const passwordSchema = z.string()
  .min(8)
  .refine(s => /[A-Z]/.test(s), 'Must contain uppercase')
  .refine(s => /[0-9]/.test(s), 'Must contain digit')
  .refine(s => /[^a-zA-Z0-9]/.test(s), 'Must contain special character');

// Extend/merge objects
const baseUser = z.object({ name: z.string(), email: z.string() });
const adminUser = baseUser.extend({ permissions: z.array(z.string()) });

// Pick/omit
const createUser = userSchema.omit({ createdAt: true });
const userSummary = userSchema.pick({ name: true, email: true });

// Passthrough (allow extra fields and keep them in the output)
const flexible = userSchema.passthrough();

// Strict (reject extra fields); by default z.object() silently strips them
const strict = userSchema.strict(); // Error on extra fields
```

### API validation with Zod

```typescript
// Express middleware
import { z } from 'zod';

// Shape of the JSON body accepted by POST /api/users
const createUserBody = z.object({
  name: z.string().min(1),
  email: z.string().email(),
  password: z.string().min(8),
});

app.post('/api/users', (req, res) => {
  const result = createUserBody.safeParse(req.body);
  if (!result.success) {
    // Malformed input is a client error: answer 400 with the Zod issues
    return res.status(400).json({ errors: result.error.issues });
  }
  const { name, email, password } = result.data;
  // ... create user
});

// Query parameter validation
// z.coerce converts the incoming value to a number before the numeric checks run
const listParams = z.object({
  page: z.coerce.number().int().min(1).default(1),
  limit: z.coerce.number().int().min(1).max(100).default(20),
  sort: z.enum(['newest', 'oldest', 'name']).default('newest'),
  q: z.string().optional(),
});

app.get('/api/users', (req, res) => {
  // safeParse rather than parse: a thrown ZodError would surface as a 500,
  // but an invalid query string should be reported as a 400 like the POST route
  const result = listParams.safeParse(req.query);
  if (!result.success) {
    return res.status(400).json({ errors: result.error.issues });
  }
  const params = result.data;
  // params.page is a number, params.sort is typed
});
```

## Pydantic (Python)

### Basic models

```python
from pydantic import BaseModel, Field, EmailStr, ValidationError, field_validator
from typing import Optional
from datetime import datetime
from enum import Enum

# String-valued enum: members compare equal to their plain string values
class Role(str, Enum):
    USER = "user"
    ADMIN = "admin"
    MODERATOR = "moderator"

# Postal address; zip_code must be 5 digits with an optional -4 digit suffix
class Address(BaseModel):
    street: str
    city: str
    zip_code: str = Field(pattern=r"^\d{5}(-\d{4})?$")

# User record with per-field constraints declared through Field(...)
class User(BaseModel):
    name: str = Field(min_length=1, max_length=100)
    email: EmailStr
    age: int = Field(ge=0, le=150)
    role: Role = Role.USER
    # default_factory builds a fresh value per instance instead of sharing one
    tags: list[str] = Field(default_factory=list, max_length=10)
    address: Optional[Address] = None
    created_at: datetime = Field(default_factory=datetime.now)

    @field_validator("name")
    @classmethod
    def name_must_not_be_empty(cls, v: str) -> str:
        """Reject whitespace-only names and return the name with surrounding whitespace removed."""
        if not v.strip():
            raise ValueError("name cannot be blank")
        return v.strip()

# Validate
user = User(name="Alice", email="alice@example.com", age=30)
print(user.model_dump())      # dict
print(user.model_dump_json())  # JSON string

# Validation errors
try:
    User(name="", email="bad", age=-1)
except ValidationError as e:  # catch only validation failures, not unrelated bugs
    print(e)  # Detailed validation errors
```

### Advanced patterns

```python
from pydantic import BaseModel, model_validator, ConfigDict
from typing import Literal, Union, Annotated

# Discriminated union
class ClickEvent(BaseModel):
    type: Literal["click"]
    x: int
    y: int

class KeypressEvent(BaseModel):
    type: Literal["keypress"]
    key: s