Data Type Classifier

ClawSkills 作者 datadrivenconstruction v2.1.0

Classify construction data by type (structured, unstructured, semi-structured). Analyze data sources and recommend appropriate storage/processing methods.

源码 ↗

安装 / 下载方式

TotalClaw CLI(推荐)
totalclaw install clawskills:datadrivenconstruction~data-type-classifier
cURL(直接下载,无需登录)
curl -fsSL https://skills.taituai.com/api/skills/clawskills%3Adatadrivenconstruction~data-type-classifier/file -o data-type-classifier.md
Git 仓库获取源码
git clone https://github.com/openclaw/skills.git && git -C skills checkout d9f41f8612cc0d2fd50d5b95487739c05d060347
# Data Type Classifier

## Overview

Based on DDC methodology (Chapter 2.1), this skill classifies construction data by type, analyzes data sources, and recommends appropriate storage, processing, and integration methods.

**Book Reference:** "Типы данных в строительстве" / "Data Types in Construction"

## Quick Start

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Tuple
from datetime import datetime
import json
import re
import mimetypes

class DataStructure(Enum):
    """Structural categories that a construction data source can be filed under."""

    # Tables, databases, spreadsheets
    STRUCTURED = "structured"

    # JSON, XML, IFC
    SEMI_STRUCTURED = "semi_structured"

    # Documents, images, videos
    UNSTRUCTURED = "unstructured"

    # CAD, BIM geometry
    GEOMETRIC = "geometric"

    # Time-series, schedules
    TEMPORAL = "temporal"

    # GIS, coordinates
    SPATIAL = "spatial"

class DataFormat(Enum):
    """Data formats a source can be labelled with, grouped by typical structure."""

    # ---- Structured ----
    CSV = "csv"
    EXCEL = "excel"
    SQL = "sql"
    PARQUET = "parquet"

    # ---- Semi-structured ----
    JSON = "json"
    XML = "xml"
    IFC = "ifc"
    BCF = "bcf"

    # ---- Unstructured ----
    PDF = "pdf"
    DOCX = "docx"
    IMAGE = "image"
    VIDEO = "video"

    # ---- Geometric ----
    DWG = "dwg"
    DXF = "dxf"
    RVT = "rvt"
    NWD = "nwd"
    OBJ = "obj"
    STL = "stl"

    # ---- Schedule ----
    MPP = "mpp"
    P6 = "p6"
    XER = "xer"

class StorageRecommendation(Enum):
    """Kinds of storage system that a classification can recommend."""

    # Database-style backends
    RELATIONAL_DB = "relational_database"
    DOCUMENT_DB = "document_database"

    # Blob/object backend
    OBJECT_STORAGE = "object_storage"

    # Specialised databases
    GRAPH_DB = "graph_database"
    TIME_SERIES_DB = "time_series_database"
    VECTOR_DB = "vector_database"

    # File-oriented backends
    FILE_SYSTEM = "file_system"
    DATA_LAKE = "data_lake"

@dataclass
class DataCharacteristics:
    """Boolean flags plus optional sizing hints describing one data source."""

    # -- Required flags (no defaults; must be supplied by the caller) --
    has_schema: bool
    has_relationships: bool
    is_queryable: bool
    is_binary: bool
    has_geometry: bool
    has_temporal: bool
    has_text_content: bool

    # -- Optional hints (each defaults to None) --
    # Average record size, in bytes.
    avg_record_size: Optional[int] = None
    # Volume bucket: small/medium/large/huge.
    estimated_volume: Optional[str] = None
    # Free-form label; allowed values are not defined in this section.
    update_frequency: Optional[str] = None

@dataclass
class DataClassification:
    """Outcome of classifying one data source."""

    # -- What was classified --
    source_name: str
    source_type: str

    # -- What it was classified as --
    detected_format: DataFormat
    structure: DataStructure
    characteristics: DataCharacteristics

    # -- Recommendations attached to the result --
    storage_recommendation: StorageRecommendation
    processing_tools: List[str]
    integration_options: List[str]
    quality_considerations: List[str]

    # Confidence score; its scale is not defined in this section.
    confidence: float

@dataclass
class ClassificationReport:
    """Aggregate report covering a set of classified data sources."""

    # Number of sources covered and their individual results.
    total_sources: int
    classifications: List[DataClassification]

    # Integer tallies keyed by string (per structure / per format).
    summary_by_structure: Dict[str, int]
    summary_by_format: Dict[str, int]

    # String-keyed lists of strings, and a string-to-string strategy map.
    storage_recommendations: Dict[str, List[str]]
    integration_strategy: Dict[str, str]


class DataTypeClassifier:
    """
    Classify construction data by type and recommend processing methods.
    Based on DDC methodology Chapter 2.1.
    """

    def __init__(self) -> None:
        """Populate the classifier's lookup tables.

        Each attribute below is assigned the return value of the matching
        ``_define_*`` builder method, called once at construction time.
        """
        self.format_signatures = self._define_format_signatures()
        self.structure_mapping = self._define_structure_mapping()
        self.storage_mapping = self._define_storage_mapping()
        self.processing_tools = self._define_processing_tools()

    def _define_format_signatures(self) -> Dict[str, Dict]:
        """Build the extension-to-format lookup table.

        Returns:
            A dict keyed by lower-case file extension (leading dot included).
            Each value is a dict holding the matching ``DataFormat`` under
            ``"format"`` and its ``DataStructure`` under ``"structure"``.
        """
        # One row per known extension: (extension, format, structure).
        # Row order is the insertion order of the resulting dict.
        known_extensions = (
            (".csv", DataFormat.CSV, DataStructure.STRUCTURED),
            (".xlsx", DataFormat.EXCEL, DataStructure.STRUCTURED),
            (".xls", DataFormat.EXCEL, DataStructure.STRUCTURED),
            (".json", DataFormat.JSON, DataStructure.SEMI_STRUCTURED),
            (".xml", DataFormat.XML, DataStructure.SEMI_STRUCTURED),
            (".ifc", DataFormat.IFC, DataStructure.SEMI_STRUCTURED),
            (".bcf", DataFormat.BCF, DataStructure.SEMI_STRUCTURED),
            (".pdf", DataFormat.PDF, DataStructure.UNSTRUCTURED),
            (".docx", DataFormat.DOCX, DataStructure.UNSTRUCTURED),
            (".dwg", DataFormat.DWG, DataStructure.GEOMETRIC),
            (".dxf", DataFormat.DXF, DataStructure.GEOMETRIC),
            (".rvt", DataFormat.RVT, DataStructure.GEOMETRIC),
            (".nwd", DataFormat.NWD, DataStructure.GEOMETRIC),
            (".mpp", DataFormat.MPP, DataStructure.TEMPORAL),
            (".xer", DataFormat.XER, DataStructure.TEMPORAL),
            (".parquet", DataFormat.PARQUET, DataStructure.STRUCTURED),
            (".jpg", DataFormat.IMAGE, DataStructure.UNSTRUCTURED),
            (".png", DataFormat.IMAGE, DataStructure.UNSTRUCTURED),
            (".mp4", DataFormat.VIDEO, DataStructure.UNSTRUCTURED),
        )
        return {
            extension: {"format": data_format, "structure": structure}
            for extension, data_format, structure in known_extensions
        }

    def _define_structure_mapping(self) -> Dict[DataStructure, Dict]:
        """Build the per-structure profile table.

        Returns:
            A dict keyed by ``DataStructure`` member. Each value is a dict with
            the keys ``"description"``, ``"examples"``, ``"query_support"`` and
            ``"schema_required"``.
        """
        def _profile(description, examples, query_support, schema_required):
            # Assemble one profile entry; key order is fixed here for every row.
            return {
                "description": description,
                "examples": examples,
                "query_support": query_support,
                "schema_required": schema_required,
            }

        profiles = {}
        profiles[DataStructure.STRUCTURED] = _profile(
            "Tabular data with fixed schema",
            ["Cost databases", "Material lists", "Vendor records"],
            True,
            True,
        )
        profiles[DataStructure.SEMI_STRUCTURED] = _profile(
            "Hierarchical data with flexible schema",
            ["BIM models (IFC)", "API responses", "Configuration files"],
            True,
            False,
        )
        profiles[DataStructure.UNSTRUCTURED] = _profile(
            "No predefined schema or format",
            ["Contracts", "Photos", "Emails", "Meeting notes"],
            False,
            False,
        )
        profiles[DataStructure.GEOMETRIC] = _profile(
            "3D/2D geometric and spatial data",
            ["CAD drawings", "BIM geometry", "Point clouds"],
            True,
            True,
        )
        profiles[DataStructure.TEMPORAL] = _profile(
            "Time-based sequential data",
            ["Schedules", "Progress data", "Sensor readings"],
            True,
            True,
        )
        profiles[DataStructure.SPATIAL] = _profile(
            "Geographic and location data",
            ["Site maps", "GPS tracks", "GIS layers"],
            True,
            True,
        )
        return profiles

    def _define_storage_mapping(self) -> Dict[DataStructure, StorageRecommendation]:
        """Build the table pairing each data structure with a storage recommendation.

        Returns:
            A dict with one ``StorageRecommendation`` per ``DataStructure``
            member.
        """
        # (structure, recommended storage) pairs; order becomes dict order.
        pairings = (
            (DataStructure.STRUCTURED, StorageRecommendation.RELATIONAL_DB),
            (DataStructure.SEMI_STRUCTURED, StorageRecommendation.DOCUMENT_DB),
            (DataStructure.UNSTRUCTURED, StorageRecommendation.OBJECT_STORAGE),
            (DataStructure.GEOMETRIC, StorageRecommendation.FILE_SYSTEM),
            (DataStructure.TEMPORAL, StorageRecommendation.TIME_SERIES_DB),
            (DataStructure.SPATIAL, StorageRecommendation.RELATIONAL_DB),
        )
        return dict(pairings)

    def _define_processing_tools(self)