RAG Construction

TotalClaw 作者 datadrivenconstruction v2.1.0

构建建筑知识库的 RAG 系统。创建可搜索的人工智能驱动的施工文档系统

源码 ↗

安装 / 下载方式

TotalClaw CLI(推荐)
totalclaw install totalclaw:datadrivenconstruction~rag-construction
cURL(直接下载,无需登录)
curl -fsSL https://skills.taituai.com/api/skills/totalclaw%3Adatadrivenconstruction~rag-construction/file -o rag-construction.md
Git 仓库获取源码
git clone https://github.com/openclaw/skills.git && cd skills && git checkout b6089a95f6533552033a2c3b28aed33c3ca8ec40
## 概述(中文)

构建建筑知识库的 RAG 系统。创建可搜索的人工智能驱动的施工文档系统

## 原文

# RAG Construction

## Overview

Based on DDC methodology (Chapter 2.3), this skill builds Retrieval-Augmented Generation (RAG) systems for construction knowledge bases, enabling semantic search and AI-powered question answering over construction documents.

**Book Reference:** "Pandas DataFrame и LLM ChatGPT" / "Pandas DataFrame and LLM ChatGPT"

## Quick Start

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Callable
from datetime import datetime
import json
import hashlib
import re

class DocumentType(Enum):
    """Categories of construction documents handled by the knowledge base.

    Every member's value is a lowercase snake_case string. The chunker copies
    ``doc_type.value`` into each chunk's metadata under the ``"doc_type"`` key,
    so these strings end up stored alongside the chunk text.
    """
    SPECIFICATION = "specification"
    DRAWING = "drawing"
    CONTRACT = "contract"
    RFI = "rfi"
    SUBMITTAL = "submittal"
    CHANGE_ORDER = "change_order"
    MEETING_MINUTES = "meeting_minutes"
    DAILY_REPORT = "daily_report"
    SAFETY_REPORT = "safety_report"
    INSPECTION = "inspection"
    MANUAL = "manual"
    STANDARD = "standard"

class ChunkingStrategy(Enum):
    """Strategies ``TextChunker`` can use to split a document into chunks.

    ``TextChunker.chunk_document`` has dedicated branches for FIXED_SIZE,
    PARAGRAPH, SECTION and SENTENCE. Any other value, including SEMANTIC,
    falls through to fixed-size chunking.
    """
    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEMANTIC = "semantic"
    SENTENCE = "sentence"

@dataclass
class DocumentChunk:
    """One piece of a document's text, the unit produced by ``TextChunker``."""
    # Identifier of this chunk; the chunker obtains it from _generate_chunk_id.
    id: str
    # ``id`` of the Document this chunk was cut from.
    document_id: str
    # The chunk text itself.
    content: str
    # Free-form metadata. The chunker writes "doc_type" and "title" and then
    # merges in the parent document's own metadata on top.
    metadata: Dict[str, Any]
    # Embedding vector; None until one is assigned.
    # NOTE(review): presumably filled in by an embedding step not visible here.
    embedding: Optional[List[float]] = None
    # Size estimate. The chunker sets this to the number of whitespace-separated
    # words in ``content``, not to a model tokenizer count.
    token_count: int = 0
    # Ordinal assigned by the chunker to order chunks within their document.
    position: int = 0

@dataclass
class Document:
    """A construction document together with the chunks derived from it."""
    # Identifier of the document; chunks refer back to it via ``document_id``.
    id: str
    # Human-readable title; copied into every chunk's metadata.
    title: str
    # Category of the document (see DocumentType).
    doc_type: DocumentType
    # Full text of the document; this is what TextChunker splits.
    content: str
    # Origin of the document.
    # NOTE(review): the expected format (path, URL, ...) is not established here.
    source: str
    # Extra key/value pairs. The chunker merges these into each chunk's metadata
    # last, so a "doc_type" or "title" key here overrides the generated one.
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Chunks belonging to this document; starts empty.
    chunks: List[DocumentChunk] = field(default_factory=list)
    # Defaults to datetime.now() at construction time, i.e. a naive
    # (timezone-unaware) local timestamp.
    created_at: datetime = field(default_factory=datetime.now)

@dataclass
class SearchResult:
    """A single hit returned by a vector-store search."""
    # The matching chunk.
    chunk: DocumentChunk
    # Score attached to the hit.
    # NOTE(review): scale and direction (higher = better?) are not established
    # in this part of the file; confirm against the vector store code.
    score: float
    # Title and type of a document; presumably the one ``chunk`` belongs to.
    document_title: str
    doc_type: DocumentType

@dataclass
class RAGResponse:
    """Result of one RAG query: the answer plus the evidence behind it."""
    # The question that was asked.
    query: str
    # The generated answer text.
    answer: str
    # Search hits associated with the answer.
    sources: List[SearchResult]
    # NOTE(review): range and derivation of the confidence value are not
    # established in this part of the file.
    confidence: float
    # NOTE(review): which tokens are counted (prompt, completion, both) is not
    # established in this part of the file.
    tokens_used: int


class TextChunker:
    """Split documents into chunks for embedding"""

    def __init__(
        self,
        strategy: ChunkingStrategy = ChunkingStrategy.PARAGRAPH,
        chunk_size: int = 500,
        chunk_overlap: int = 50
    ) -> None:
        """Store the chunking configuration.

        Args:
            strategy: Strategy that ``chunk_document`` dispatches on.
            chunk_size: Target chunk size. The fixed-size and paragraph
                chunkers compare it against ``len()`` of the text, so it is
                measured in characters.
            chunk_overlap: Number of characters the fixed-size chunker steps
                back from the end of one chunk to start the next.
        """
        self.strategy = strategy
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_document(self, document: Document) -> List[DocumentChunk]:
        """Route ``document`` to the chunker that matches ``self.strategy``.

        Strategies without a dedicated chunker (for example ``SEMANTIC``) are
        handled by the fixed-size chunker.
        """
        # Ordered (strategy, method name) pairs; names are resolved lazily so
        # only the selected chunker is ever looked up.
        routes = (
            (ChunkingStrategy.FIXED_SIZE, "_chunk_fixed_size"),
            (ChunkingStrategy.PARAGRAPH, "_chunk_by_paragraph"),
            (ChunkingStrategy.SECTION, "_chunk_by_section"),
            (ChunkingStrategy.SENTENCE, "_chunk_by_sentence"),
        )
        for candidate, method_name in routes:
            if self.strategy == candidate:
                return getattr(self, method_name)(document)

        # Nothing matched: fall back to fixed-size windows.
        return self._chunk_fixed_size(document)

    def _chunk_fixed_size(self, document: Document) -> List[DocumentChunk]:
        """Split ``document.content`` into overlapping, roughly fixed-size chunks.

        Each window spans at most ``self.chunk_size`` characters. A window that
        ends before the end of the text is shortened to the last whitespace
        character inside it so that words are not cut in half. If the window
        holds no whitespace at all (a very long token, or text written without
        spaces such as CJK), it is cut at the size limit instead. The next
        window starts ``self.chunk_overlap`` characters before the end of the
        previous one, but always strictly after the previous start so the scan
        is guaranteed to terminate.

        Args:
            document: Document whose ``content`` is split.

        Returns:
            Chunks in reading order. Whitespace-only windows are skipped and
            ``position`` numbers only the chunks that are emitted.

        Raises:
            ValueError: If the text is non-empty and ``self.chunk_size`` is not
                positive (no window could ever make progress).
        """
        text = document.content
        if not text:
            return []
        if self.chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be positive, got {self.chunk_size}"
            )

        size = self.chunk_size
        # A negative overlap would silently skip text between chunks.
        overlap = max(0, self.chunk_overlap)
        text_len = len(text)

        chunks: List[DocumentChunk] = []
        start = 0
        position = 0

        while start < text_len:
            end = min(start + size, text_len)

            if end < text_len:
                # Back up to the last whitespace so a word is not split.
                boundary = end
                while boundary > start and text[boundary] not in ' \n\t':
                    boundary -= 1
                # boundary == start means the window has no usable break
                # point; keep the hard cut at `end` instead of producing an
                # empty chunk and stalling.
                if boundary > start:
                    end = boundary

            chunk_text = text[start:end].strip()
            if chunk_text:
                chunk_id = self._generate_chunk_id(document.id, position)
                chunks.append(DocumentChunk(
                    id=chunk_id,
                    document_id=document.id,
                    content=chunk_text,
                    metadata={
                        "doc_type": document.doc_type.value,
                        "title": document.title,
                        **document.metadata
                    },
                    token_count=len(chunk_text.split()),
                    position=position
                ))
                position += 1

            if end >= text_len:
                # Text exhausted; stepping back by the overlap would only
                # re-emit a tail already contained in this chunk.
                break

            # Snapping to a word boundary (or a large overlap) can put the
            # overlapped start at or before the current one; in that case
            # drop the overlap for this step so the scan always advances.
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    def _chunk_by_paragraph(self, document: Document) -> List[DocumentChunk]:
        """Build chunks by packing consecutive paragraphs together.

        Paragraphs are the blank-line-separated (``\\n\\n``) pieces of
        ``document.content``; empty ones are ignored. Paragraphs are appended
        to the current chunk while the combined length stays below
        ``self.chunk_size``; otherwise the current chunk is closed and the
        paragraph starts a new one. A single paragraph longer than
        ``self.chunk_size`` is kept whole as its own chunk.
        """
        # Phase 1: decide which paragraphs travel together.
        grouped: List[str] = []
        buffer = ""
        for raw in document.content.split('\n\n'):
            paragraph = raw.strip()
            if not paragraph:
                continue

            if len(buffer) + len(paragraph) < self.chunk_size:
                # Still room: extend the open chunk.
                buffer = buffer + "\n\n" + paragraph if buffer else paragraph
                continue

            # No room: close the open chunk (if any) and start a new one.
            if buffer:
                grouped.append(buffer)
            buffer = paragraph

        if buffer:
            grouped.append(buffer)

        # Phase 2: wrap every group in a DocumentChunk, numbered in order.
        return [
            DocumentChunk(
                id=self._generate_chunk_id(document.id, index),
                document_id=document.id,
                content=body,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                token_count=len(body.split()),
                position=index
            )
            for index, body in enumerate(grouped)
        ]

    def _chunk_by_section(self, document: Document) -> List[DocumentChunk]:
        """Chunk by document sections (headers)"""
        # Split by common section patterns
        section_pattern = r'\n(?=(?:\d+\.|\d+\s|SECTION|ARTICLE|PART)\s+[A-Z])'
        sections = re.split(section_pattern, document.content)

        chunks = []
        for position, section in enumerate(sections):
            section = section.strip()
            if section:
                # If section is too large, further split it
                if len(section) > self.chunk_size * 2:
                    sub_chunker = TextChunker(ChunkingStrategy.PARAGRAPH, self.chunk_size)
                    sub_doc = Document(
                        id=f"{document.id}_sec{position}",
                        title=document.title,
                        doc_type=document.doc_type,
                        content=section,
                        source=document.source,
                        metadata=document.metadata
                    )
                    sub_chunks = sub_chunker.chunk_document(sub_doc)
                    for i, chunk in enumerate(sub_chunks):
                        chunk.id = self._generate_chunk_id(document.id, position * 100 + i)
                        chunk.position = position * 100 + i
                    chunks.extend(sub_chunks)
                else:
                    chunk_id = self._generate_chunk_id(document.id, position)
                    chunks.append(DocumentChunk(
                        id=chunk_id,
                        document_id=document.id,
                        content=section,
                        metadata={
                            "doc_type": document.doc_type.value,
                            "title": document.title,
                            **docum