Data Type Classifier
Classify construction data by type (structured, unstructured, semi-structured). Analyze data sources and recommend appropriate storage/processing methods
安装 / 下载方式
TotalClaw CLI推荐
totalclaw install clawskills:datadrivenconstruction~data-type-classifier
cURL直接下载,无需登录
curl -fsSL https://skills.taituai.com/api/skills/clawskills%3Adatadrivenconstruction~data-type-classifier/file -o data-type-classifier.md
Git 仓库获取源码
git clone https://github.com/openclaw/skills/commit/d9f41f8612cc0d2fd50d5b95487739c05d060347
# Data Type Classifier
## Overview
Based on DDC methodology (Chapter 2.1), this skill classifies construction data by type, analyzes data sources, and recommends appropriate storage, processing, and integration methods.
**Book Reference:** "Типы данных в строительстве" / "Data Types in Construction"
## Quick Start
```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Tuple
from datetime import datetime
import json
import re
import mimetypes
class DataStructure(Enum):
    """Data structure classification.

    Each member's value is the lowercase string identifier of the category;
    the trailing comment on each member lists typical examples.
    """

    STRUCTURED = "structured"  # Tables, databases, spreadsheets
    SEMI_STRUCTURED = "semi_structured"  # JSON, XML, IFC
    UNSTRUCTURED = "unstructured"  # Documents, images, videos
    GEOMETRIC = "geometric"  # CAD, BIM geometry
    TEMPORAL = "temporal"  # Time-series, schedules
    SPATIAL = "spatial"  # GIS, coordinates
class DataFormat(Enum):
    """Common construction data formats.

    Members are grouped by the kind of data they carry; each value is the
    lowercase string identifier of the format.
    """

    # Structured
    CSV = "csv"
    EXCEL = "excel"
    SQL = "sql"
    PARQUET = "parquet"

    # Semi-structured
    JSON = "json"
    XML = "xml"
    IFC = "ifc"
    BCF = "bcf"

    # Unstructured
    PDF = "pdf"
    DOCX = "docx"
    IMAGE = "image"
    VIDEO = "video"

    # Geometric
    DWG = "dwg"
    DXF = "dxf"
    RVT = "rvt"
    NWD = "nwd"
    OBJ = "obj"
    STL = "stl"

    # Schedule
    MPP = "mpp"
    P6 = "p6"
    XER = "xer"
class StorageRecommendation(Enum):
    """Storage system recommendations.

    Each member's value is the snake_case string identifier of the storage
    system kind.
    """

    RELATIONAL_DB = "relational_database"
    DOCUMENT_DB = "document_database"
    OBJECT_STORAGE = "object_storage"
    GRAPH_DB = "graph_database"
    TIME_SERIES_DB = "time_series_database"
    VECTOR_DB = "vector_database"
    FILE_SYSTEM = "file_system"
    DATA_LAKE = "data_lake"
@dataclass
class DataCharacteristics:
    """Characteristics of a data source.

    The seven boolean flags are required and positional in the order
    declared below; the three trailing fields are optional and default to
    ``None``.
    """

    # Required boolean flags describing the source.
    has_schema: bool
    has_relationships: bool
    is_queryable: bool
    is_binary: bool
    has_geometry: bool
    has_temporal: bool
    has_text_content: bool

    # Optional sizing / usage hints.
    avg_record_size: Optional[int] = None  # bytes
    estimated_volume: Optional[str] = None  # small/medium/large/huge
    # NOTE(review): the allowed values for update_frequency are not defined
    # in this section — confirm against the code that populates it.
    update_frequency: Optional[str] = None
@dataclass
class DataClassification:
    """Classification result for a data source.

    All fields are required; none has a default value.
    """

    # Identification of the classified source.
    source_name: str
    source_type: str

    # What the source was classified as.
    detected_format: DataFormat
    structure: DataStructure
    characteristics: DataCharacteristics

    # Recommendations attached to the classification.
    storage_recommendation: StorageRecommendation
    processing_tools: List[str]
    integration_options: List[str]
    quality_considerations: List[str]

    # NOTE(review): the scale of confidence is not defined in this section
    # (presumably 0.0-1.0) — confirm against the code that sets it.
    confidence: float
@dataclass
class ClassificationReport:
    """Complete classification report.

    All fields are required; none has a default value.
    """

    total_sources: int
    classifications: List[DataClassification]

    # Summaries keyed by string.
    # NOTE(review): presumably the keys are the enum ``.value`` strings and
    # the ints are counts — confirm against the code that builds the report.
    summary_by_structure: Dict[str, int]
    summary_by_format: Dict[str, int]

    # String-keyed recommendation tables; key/value semantics are set by the
    # code that builds the report, which is not visible in this section.
    storage_recommendations: Dict[str, List[str]]
    integration_strategy: Dict[str, str]
class DataTypeClassifier:
"""
Classify construction data by type and recommend processing methods.
Based on DDC methodology Chapter 2.1.
"""
def __init__(self) -> None:
    """Build the classifier's lookup tables.

    Each table is produced by calling the matching ``_define_*`` builder
    once and storing its result on the instance.
    """
    self.format_signatures = self._define_format_signatures()
    self.structure_mapping = self._define_structure_mapping()
    self.storage_mapping = self._define_storage_mapping()
    self.processing_tools = self._define_processing_tools()
def _define_format_signatures(self) -> Dict[str, Dict[str, Any]]:
    """Define format detection signatures.

    Returns:
        A dict keyed by lowercase file extension (including the leading
        dot). Each value is a dict with two entries: ``"format"`` (a
        ``DataFormat`` member) and ``"structure"`` (a ``DataStructure``
        member).
    """
    return {
        # File extensions
        ".csv": {"format": DataFormat.CSV, "structure": DataStructure.STRUCTURED},
        ".xlsx": {"format": DataFormat.EXCEL, "structure": DataStructure.STRUCTURED},
        ".xls": {"format": DataFormat.EXCEL, "structure": DataStructure.STRUCTURED},
        ".json": {"format": DataFormat.JSON, "structure": DataStructure.SEMI_STRUCTURED},
        ".xml": {"format": DataFormat.XML, "structure": DataStructure.SEMI_STRUCTURED},
        ".ifc": {"format": DataFormat.IFC, "structure": DataStructure.SEMI_STRUCTURED},
        ".bcf": {"format": DataFormat.BCF, "structure": DataStructure.SEMI_STRUCTURED},
        ".pdf": {"format": DataFormat.PDF, "structure": DataStructure.UNSTRUCTURED},
        ".docx": {"format": DataFormat.DOCX, "structure": DataStructure.UNSTRUCTURED},
        ".dwg": {"format": DataFormat.DWG, "structure": DataStructure.GEOMETRIC},
        ".dxf": {"format": DataFormat.DXF, "structure": DataStructure.GEOMETRIC},
        ".rvt": {"format": DataFormat.RVT, "structure": DataStructure.GEOMETRIC},
        ".nwd": {"format": DataFormat.NWD, "structure": DataStructure.GEOMETRIC},
        ".mpp": {"format": DataFormat.MPP, "structure": DataStructure.TEMPORAL},
        ".xer": {"format": DataFormat.XER, "structure": DataStructure.TEMPORAL},
        ".parquet": {"format": DataFormat.PARQUET, "structure": DataStructure.STRUCTURED},
        ".jpg": {"format": DataFormat.IMAGE, "structure": DataStructure.UNSTRUCTURED},
        ".png": {"format": DataFormat.IMAGE, "structure": DataStructure.UNSTRUCTURED},
        ".mp4": {"format": DataFormat.VIDEO, "structure": DataStructure.UNSTRUCTURED},
    }
def _define_structure_mapping(self) -> Dict[DataStructure, Dict]:
    """Define characteristics for each structure type.

    Returns:
        A dict with one entry per ``DataStructure`` member. Each value is
        a dict with the keys ``"description"`` (str), ``"examples"``
        (list of str), ``"query_support"`` (bool) and
        ``"schema_required"`` (bool).
    """
    return {
        DataStructure.STRUCTURED: {
            "description": "Tabular data with fixed schema",
            "examples": ["Cost databases", "Material lists", "Vendor records"],
            "query_support": True,
            "schema_required": True,
        },
        DataStructure.SEMI_STRUCTURED: {
            "description": "Hierarchical data with flexible schema",
            "examples": ["BIM models (IFC)", "API responses", "Configuration files"],
            "query_support": True,
            "schema_required": False,
        },
        DataStructure.UNSTRUCTURED: {
            "description": "No predefined schema or format",
            "examples": ["Contracts", "Photos", "Emails", "Meeting notes"],
            "query_support": False,
            "schema_required": False,
        },
        DataStructure.GEOMETRIC: {
            "description": "3D/2D geometric and spatial data",
            "examples": ["CAD drawings", "BIM geometry", "Point clouds"],
            "query_support": True,
            "schema_required": True,
        },
        DataStructure.TEMPORAL: {
            "description": "Time-based sequential data",
            "examples": ["Schedules", "Progress data", "Sensor readings"],
            "query_support": True,
            "schema_required": True,
        },
        DataStructure.SPATIAL: {
            "description": "Geographic and location data",
            "examples": ["Site maps", "GPS tracks", "GIS layers"],
            "query_support": True,
            "schema_required": True,
        },
    }
def _define_storage_mapping(self) -> Dict[DataStructure, StorageRecommendation]:
    """Map data structures to storage recommendations.

    Returns:
        A dict with one entry per ``DataStructure`` member, giving the
        ``StorageRecommendation`` associated with that structure.
    """
    return {
        DataStructure.STRUCTURED: StorageRecommendation.RELATIONAL_DB,
        DataStructure.SEMI_STRUCTURED: StorageRecommendation.DOCUMENT_DB,
        DataStructure.UNSTRUCTURED: StorageRecommendation.OBJECT_STORAGE,
        DataStructure.GEOMETRIC: StorageRecommendation.FILE_SYSTEM,
        DataStructure.TEMPORAL: StorageRecommendation.TIME_SERIES_DB,
        # NOTE(review): spatial data shares the relational recommendation with
        # structured data — presumably a spatially-enabled relational
        # database is intended; confirm.
        DataStructure.SPATIAL: StorageRecommendation.RELATIONAL_DB,
    }
def _define_processing_tools(self)