Handle messy CSVs with encoding detection, delimiter inference, and malformed row recovery.
View on GitHub: majesticlabs-dev/majestic-marketplace
majestic-data
January 24, 2026
npx add-skill https://github.com/majesticlabs-dev/majestic-marketplace/blob/main/plugins/majestic-data/skills/csv-wrangler/SKILL.md -a claude-code --skill csv-wrangler

Installation path: .claude/skills/csv-wrangler/

# CSV-Wrangler
Patterns for handling real-world messy CSV files.
## Encoding Detection
```python
import chardet
import pandas as pd


def detect_encoding(file_path: str, sample_size: int = 10000) -> str | None:
    """Detect file encoding from a sample of raw bytes."""
    with open(file_path, 'rb') as f:
        raw = f.read(sample_size)
    result = chardet.detect(raw)
    return result['encoding']


def read_with_encoding(path: str) -> pd.DataFrame:
    """Read CSV with auto-detected encoding."""
    encoding = detect_encoding(path)
    # Common fallback chain (skip None if chardet could not decide)
    encodings = [enc for enc in (encoding, 'utf-8', 'latin-1', 'cp1252') if enc]
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # Last resort: ignore undecodable bytes
    return pd.read_csv(path, encoding='utf-8', encoding_errors='ignore')
```
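A quick usage sketch; the path `data/export.csv` is an illustrative placeholder, not part of the skill:

```python
path = "data/export.csv"  # hypothetical path, for illustration only
print(detect_encoding(path))  # e.g. 'Windows-1252'
df = read_with_encoding(path)
print(df.shape)
```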
## Delimiter Detection
```python
import csv


def detect_delimiter(file_path: str) -> str:
    """Detect CSV delimiter from a file sample."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        sample = f.read(4096)
    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample, delimiters=',;\t|')
        return dialect.delimiter
    except csv.Error:
        # Fall back to counting occurrences and picking the most common
        counts = {d: sample.count(d) for d in [',', ';', '\t', '|']}
        return max(counts, key=counts.get)


def read_with_delimiter_detection(path: str) -> pd.DataFrame:
    """Read CSV with auto-detected delimiter."""
    delimiter = detect_delimiter(path)
    return pd.read_csv(path, sep=delimiter)
```
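The encoding and delimiter helpers compose naturally. A minimal sketch, assuming both functions above are in scope; the `read_messy_csv` name is an illustrative choice, not part of the skill:

```python
def read_messy_csv(path: str) -> pd.DataFrame:
    """Sketch: detect encoding and delimiter, then load with pandas."""
    encoding = detect_encoding(path) or 'utf-8'
    delimiter = detect_delimiter(path)
    return pd.read_csv(path, encoding=encoding, sep=delimiter)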
## Handling Malformed Rows
```python
def read_with_error_handling(path: str) -> tuple[pd.DataFrame, list[dict]]:
    """Read CSV, capturing malformed rows separately."""
    good_rows = []
    bad_rows = []
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        header = next(reader)
        expected_cols = len(header)
        for line_num, row in enumerate(reader, start=2):
            if len(row) == expected_cols:
                good_rows.append(row)
            else:
                # Keep the malformed row and its line number for later review
                bad_rows.append({'line': line_num, 'row': row})
    df = pd.DataFrame(good_rows, columns=header)
    return df, bad_rows
```
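Usage sketch, again with a hypothetical file path:

```python
df, bad = read_with_error_handling("data/export.csv")  # hypothetical path
print(f"recovered {len(df)} rows, {len(bad)} malformed")
for item in bad[:5]:
    print(item['line'], item['row'])  # inspect the first few rejected rows
```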