RAG Construction
Build RAG systems for construction knowledge bases. Create searchable, AI-powered construction document systems.
安装 / 下载方式
TotalClaw CLI(推荐)
totalclaw install clawskills:datadrivenconstruction~rag-construction
cURL 直接下载,无需登录
curl -fsSL https://skills.taituai.com/api/skills/clawskills%3Adatadrivenconstruction~rag-construction/file -o rag-construction.md
Git 仓库获取源码
git clone https://github.com/openclaw/skills/commit/b6089a95f6533552033a2c3b28aed33c3ca8ec40
# RAG Construction
## Overview
Based on DDC methodology (Chapter 2.3), this skill builds Retrieval-Augmented Generation (RAG) systems for construction knowledge bases, enabling semantic search and AI-powered question answering over construction documents.
**Book Reference:** "Pandas DataFrame и LLM ChatGPT" / "Pandas DataFrame and LLM ChatGPT"
## Quick Start
```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Callable
from datetime import datetime
import json
import hashlib
import re
class DocumentType(Enum):
    """Types of construction documents.

    Each member's value is a lowercase string tag; the chunkers in this
    file copy that value into chunk metadata under the ``"doc_type"`` key.
    """

    SPECIFICATION = "specification"
    DRAWING = "drawing"
    CONTRACT = "contract"
    RFI = "rfi"
    SUBMITTAL = "submittal"
    CHANGE_ORDER = "change_order"
    MEETING_MINUTES = "meeting_minutes"
    DAILY_REPORT = "daily_report"
    SAFETY_REPORT = "safety_report"
    INSPECTION = "inspection"
    MANUAL = "manual"
    STANDARD = "standard"
class ChunkingStrategy(Enum):
    """Text chunking strategies.

    Selects how ``TextChunker.chunk_document`` splits a document.
    ``SEMANTIC`` has no branch of its own in that dispatcher and is
    handled by its fallback, which is fixed-size chunking.
    """

    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEMANTIC = "semantic"
    SENTENCE = "sentence"
@dataclass
class DocumentChunk:
    """A chunk of document text.

    Attributes:
        id: Identifier of this chunk.
        document_id: ``id`` of the document the chunk was cut from.
        content: The chunk's text.
        metadata: Key/value data attached to the chunk. The chunkers in
            this file store ``"doc_type"``, ``"title"`` and the source
            document's own metadata here.
        embedding: Embedding vector, or ``None`` when none has been set.
        token_count: Size of the chunk; the chunkers in this file fill it
            with a whitespace-split word count rather than model tokens.
        position: Ordinal of the chunk within its document.
    """

    id: str
    document_id: str
    content: str
    metadata: Dict[str, Any]
    embedding: Optional[List[float]] = None
    token_count: int = 0
    position: int = 0
@dataclass
class Document:
    """Construction document.

    Attributes:
        id: Identifier of the document.
        title: Document title; copied into each chunk's metadata by the
            chunkers in this file.
        doc_type: Category of the document.
        content: Full text of the document; this is what gets chunked.
        source: Where the document came from (format not fixed by this
            class -- presumably a path or URL; confirm with callers).
        metadata: Extra key/value data; merged into each chunk's metadata
            by the chunkers in this file. Defaults to a fresh empty dict.
        chunks: Chunks belonging to this document. Defaults to a fresh
            empty list.
        created_at: Defaults to ``datetime.now()`` at construction time,
            i.e. a naive local timestamp.
    """

    id: str
    title: str
    doc_type: DocumentType
    content: str
    source: str
    metadata: Dict[str, Any] = field(default_factory=dict)
    chunks: List[DocumentChunk] = field(default_factory=list)
    created_at: datetime = field(default_factory=datetime.now)
@dataclass
class SearchResult:
    """Search result from vector store.

    Attributes:
        chunk: The matched chunk.
        score: Match score for the chunk (scale and direction are not
            defined by this class -- confirm against the vector store).
        document_title: Title of the document the chunk belongs to.
        doc_type: Category of that document.
    """

    chunk: DocumentChunk
    score: float
    document_title: str
    doc_type: DocumentType
@dataclass
class RAGResponse:
    """Response from RAG system.

    Attributes:
        query: The question that was asked.
        answer: The generated answer text.
        sources: Search results the answer is attributed to.
        confidence: Confidence value for the answer (scale is not defined
            by this class -- confirm against the producer).
        tokens_used: Number of tokens reported for producing the answer.
    """

    query: str
    answer: str
    sources: List[SearchResult]
    confidence: float
    tokens_used: int
class TextChunker:
"""Split documents into chunks for embedding"""
def __init__(
    self,
    strategy: ChunkingStrategy = ChunkingStrategy.PARAGRAPH,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """Store the chunking configuration.

    Args:
        strategy: Which splitting strategy ``chunk_document`` applies.
        chunk_size: Target chunk size, measured in characters (the
            chunkers compare it against string lengths and indices).
        chunk_overlap: Number of characters consecutive chunks share.
            Among the chunkers visible in this file only the fixed-size
            one reads it.
    """
    self.strategy = strategy
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
def chunk_document(self, document: Document) -> List[DocumentChunk]:
    """Split a document into chunks using the configured strategy.

    Args:
        document: The document whose ``content`` is to be split.

    Returns:
        The chunks produced by the strategy-specific method.
    """
    if self.strategy == ChunkingStrategy.FIXED_SIZE:
        return self._chunk_fixed_size(document)
    if self.strategy == ChunkingStrategy.PARAGRAPH:
        return self._chunk_by_paragraph(document)
    if self.strategy == ChunkingStrategy.SECTION:
        return self._chunk_by_section(document)
    if self.strategy == ChunkingStrategy.SENTENCE:
        return self._chunk_by_sentence(document)
    # Any other strategy (including SEMANTIC, which has no dedicated
    # implementation here) falls back to fixed-size chunking.
    return self._chunk_fixed_size(document)
def _chunk_fixed_size(self, document: Document) -> List[DocumentChunk]:
    """Chunk by fixed character size with overlap.

    Walks the text in windows of ``chunk_size`` characters. A window that
    stops short of the end of the text is shortened back to the nearest
    space, newline or tab so that words are not cut; when the window
    contains no such whitespace it is split hard at ``chunk_size``
    instead. Consecutive windows share up to ``chunk_overlap``
    characters, and every iteration is guaranteed to move forward.
    Chunking stops as soon as a window reaches the end of the text, so
    no trailing chunk that merely repeats the previous chunk's tail is
    produced.

    Args:
        document: The document whose ``content`` is to be split.

    Returns:
        Chunks in reading order, with ``position`` counting from 0.
        Windows that are empty after stripping are skipped. An empty
        ``content`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if self.chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    chunks = []
    text = document.content
    text_len = len(text)
    # An overlap of chunk_size or more could never advance; cap it.
    overlap = min(max(0, self.chunk_overlap), self.chunk_size - 1)
    start = 0
    position = 0

    while start < text_len:
        end = min(start + self.chunk_size, text_len)

        # Find word boundary
        if end < text_len:
            boundary = end
            while boundary > start and text[boundary] not in ' \n\t':
                boundary -= 1
            # boundary == start means the window has no usable
            # whitespace; keep the hard split at `end` rather than
            # producing an empty window.
            if boundary > start:
                end = boundary

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunk_id = self._generate_chunk_id(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=chunk_text,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                # Whitespace-split word count, used as a token estimate.
                token_count=len(chunk_text.split()),
                position=position
            ))
            position += 1

        if end >= text_len:
            # The whole text has been covered.
            break

        next_start = end - overlap
        if next_start <= start:
            # The window was shorter than the overlap; drop the overlap
            # for this step so the loop still makes progress.
            next_start = end
        start = next_start

    return chunks
def _chunk_by_paragraph(self, document: Document) -> List[DocumentChunk]:
    """Chunk by paragraphs.

    Splits the text on blank lines (``"\\n\\n"``) and greedily packs
    consecutive paragraphs into one chunk while the combined length
    stays below ``chunk_size`` characters. The two-character separator
    is not counted in that check. A paragraph is never split, so a
    single paragraph of ``chunk_size`` characters or more becomes one
    oversized chunk. ``chunk_overlap`` is not used by this strategy.

    Args:
        document: The document whose ``content`` is to be split.

    Returns:
        Chunks in reading order, with ``position`` counting from 0.
        Empty or whitespace-only paragraphs are ignored.
    """
    chunks = []

    def emit(text: str) -> None:
        # Chunks are only ever appended, so the next position is simply
        # the number of chunks emitted so far.
        position = len(chunks)
        chunk_id = self._generate_chunk_id(document.id, position)
        chunks.append(DocumentChunk(
            id=chunk_id,
            document_id=document.id,
            content=text,
            metadata={
                "doc_type": document.doc_type.value,
                "title": document.title,
                **document.metadata
            },
            # Whitespace-split word count, used as a token estimate.
            token_count=len(text.split()),
            position=position
        ))

    current_chunk = ""
    for para in document.content.split('\n\n'):
        para = para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) < self.chunk_size:
            # Still room: extend the chunk being built.
            current_chunk = current_chunk + "\n\n" + para if current_chunk else para
        else:
            # Full: flush what has been collected and start a new chunk
            # with this paragraph.
            if current_chunk:
                emit(current_chunk)
            current_chunk = para

    # Add remaining content
    if current_chunk:
        emit(current_chunk)

    return chunks
def _chunk_by_section(self, document: Document) -> List[DocumentChunk]:
"""Chunk by document sections (headers)"""
# Split by common section patterns
section_pattern = r'\n(?=(?:\d+\.|\d+\s|SECTION|ARTICLE|PART)\s+[A-Z])'
sections = re.split(section_pattern, document.content)
chunks = []
for position, section in enumerate(sections):
section = section.strip()
if section:
# If section is too large, further split it
if len(section) > self.chunk_size * 2:
sub_chunker = TextChunker(ChunkingStrategy.PARAGRAPH, self.chunk_size)
sub_doc = Document(
id=f"{document.id}_sec{position}",
title=document.title,
doc_type=document.doc_type,
content=section,
source=document.source,
metadata=document.metadata
)
sub_chunks = sub_chunker.chunk_document(sub_doc)
for i, chunk in enumerate(sub_chunks):
chunk.id = self._generate_chunk_id(document.id, position * 100 + i)
chunk.position = position * 100 + i
chunks.extend(sub_chunks)
else:
chunk_id = self._generate_chunk_id(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=section,