Files
eleakxir/back/search/dataleak/dataleak.go
2026-01-06 16:06:27 +01:00

313 lines
7.8 KiB
Go

package dataleak
import (
"fmt"
"path/filepath"
"slices"
"strconv"
"strings"
"time"
"github.com/anotherhadi/eleakxir/backend/server"
"github.com/charmbracelet/log"
)
// LeakResult is the outcome of a Search call.
type LeakResult struct {
// Duration is the elapsed time of the search. Search only sets it on the
// success path; it stays zero when Error or Inactive is set.
Duration time.Duration
// Rows holds one map per returned row, keyed by column name. NULL values
// are omitted, and the "filename" column is exposed under the "source" key.
Rows []map[string]string
// Error is the failure message, or empty when the search succeeded.
Error string
// Inactive is true when no dataleaks are loaded, so no query was run.
Inactive bool
// LimitHit is true when the number of fetched rows (counted before
// de-duplication) reached the configured limit.
LimitHit bool // Whether the search hit the limit
}
// Search looks up queryText in the loaded dataleaks and returns the matching
// rows.
//
// column selects which columns are searched and exactMatch toggles exact
// matching; both are forwarded to buildSqlQuery together with includeFolders.
//
// The result is reported through LeakResult rather than an error value:
//   - Inactive is set when no dataleaks are loaded.
//   - Error is set when the query could not be built or executed. Rows fetched
//     before a scan or iteration error are left in the result.
//   - LimitHit is evaluated on the raw row count, before duplicates are
//     removed.
//   - Duration is only filled in when the search completes successfully.
func Search(s *server.Server, queryText, column string, exactMatch bool, includeFolders []bool) LeakResult {
	// Nothing to search: report the feature as inactive.
	if len(*(s.Dataleaks)) == 0 {
		return LeakResult{Inactive: true}
	}

	started := time.Now()
	var result LeakResult

	// buildSqlQuery signals failures in-band with an "error:" prefix.
	query := buildSqlQuery(s, queryText, column, exactMatch, includeFolders)
	if strings.HasPrefix(query, "error:") {
		result.Error = strings.TrimPrefix(query, "error: ")
		return result
	}
	if s.Settings.Debug {
		log.Info("New query:", "query", query)
	}

	rows, err := s.Duckdb.Query(query)
	if err != nil {
		result.Error = err.Error()
		return result
	}
	defer rows.Close()

	columnNames, err := rows.Columns()
	if err != nil {
		result.Error = err.Error()
		return result
	}

	// Every column is scanned as raw bytes; a nil slice marks a NULL value.
	cells := make([][]byte, len(columnNames))
	scanTargets := make([]any, len(cells))
	for i := range cells {
		scanTargets[i] = &cells[i]
	}

	for rows.Next() {
		if scanErr := rows.Scan(scanTargets...); scanErr != nil {
			result.Error = scanErr.Error()
			return result
		}

		record := map[string]string{}
		for i, name := range columnNames {
			cell := cells[i]
			switch {
			case cell == nil, name == "":
				// NULL value or unnamed column: leave it out of the record.
			case name == "filename":
				// The parquet file name is surfaced as the row's source.
				record["source"] = server.FormatParquetName(string(cell))
			default:
				record[name] = string(cell)
			}
		}
		result.Rows = append(result.Rows, record)
	}
	if iterErr := rows.Err(); iterErr != nil {
		result.Error = iterErr.Error()
		return result
	}

	// The limit check uses the raw count, since the SQL LIMIT applies before
	// duplicates are dropped below.
	result.LimitHit = len(result.Rows) >= s.Settings.Limit
	result.Rows = removeDuplicateMaps(result.Rows)
	result.Duration = time.Since(started)
	return result
}
// removeDuplicateMaps returns the maps from the input with exact duplicates
// removed. Two maps are duplicates when they contain the same set of key/value
// pairs. The first occurrence is kept and the input order is preserved.
//
// The returned slice is always non-nil, even for nil or empty input.
func removeDuplicateMaps(maps []map[string]string) []map[string]string {
	seen := make(map[string]struct{}, len(maps))
	result := make([]map[string]string, 0, len(maps))

	for _, m := range maps {
		// Map iteration order is random, so sort the keys to get a stable
		// fingerprint for the same content.
		keys := make([]string, 0, len(m))
		for k := range m {
			keys = append(keys, k)
		}
		slices.Sort(keys)

		// Keys and values are quoted so that separators occurring inside the
		// data cannot make two different maps produce the same fingerprint
		// (e.g. {"a": "1;b=2"} versus {"a": "1", "b": "2"}).
		var sb strings.Builder
		for _, k := range keys {
			sb.WriteString(strconv.Quote(k))
			sb.WriteByte('=')
			sb.WriteString(strconv.Quote(m[k]))
			sb.WriteByte(';')
		}

		fingerprint := sb.String()
		if _, dup := seen[fingerprint]; dup {
			continue
		}
		seen[fingerprint] = struct{}{}
		result = append(result, m)
	}
	return result
}
// buildSqlQuery assembles the DuckDB SELECT statement used by Search.
//
// includeFolders is a positional mask over s.Settings.Folders: folder i is
// searched when includeFolders[i] is true. Folders without a corresponding
// flag are ignored.
//
// column selects where to look:
//   - "" or "all": the configured base columns;
//   - "full_text" (case-insensitive): every column found in the selected
//     dataleaks;
//   - anything else: that single column.
//
// A requested column matches an available one when the names are equal
// ignoring case, or when the available name ends with "_<requested>".
//
// Failures are reported in-band: the returned string starts with "error: "
// followed by the message. When no available column matches, an unfiltered
// "SELECT * ... LIMIT n" query is returned.
func buildSqlQuery(s *server.Server, queryText, column string, exactMatch bool, includeFolders []bool) string {
	includedFolders := []string{}
	for i, f := range s.Settings.Folders {
		if i >= len(includeFolders) {
			break
		}
		if includeFolders[i] {
			includedFolders = append(includedFolders, f)
		}
	}
	if len(includedFolders) == 0 {
		return "error: no folders included"
	}

	// Step 1: Determine candidate columns to search.
	var candidateColumns []string
	if column == "all" || column == "" {
		candidateColumns = s.Settings.BaseColumns
	} else {
		candidateColumns = []string{column}
	}

	// Step 2: Collect all available columns across the selected dataleaks,
	// keeping first-seen order.
	allColumns := make([]string, 0)
	seen := make(map[string]struct{})
	for _, dataleak := range *s.Dataleaks {
		if !isPathInFolders(dataleak.Path, includedFolders) {
			continue
		}
		for _, col := range dataleak.Columns {
			if _, ok := seen[col]; !ok {
				seen[col] = struct{}{}
				allColumns = append(allColumns, col)
			}
		}
	}

	// Step 3: Resolve which columns should actually be used in the WHERE clause.
	var columnsFiltered []string
	if strings.EqualFold(column, "full_text") {
		// "full_text" means search across all columns.
		columnsFiltered = allColumns
	} else {
		// A column can match several candidates (e.g. "first_name" matches
		// both "first_name" and "name"); add it to the WHERE clause only once.
		matched := make(map[string]struct{})
		for _, candidate := range candidateColumns {
			suffix := "_" + strings.ToLower(candidate)
			for _, available := range allColumns {
				if !strings.EqualFold(available, candidate) &&
					!strings.HasSuffix(strings.ToLower(available), suffix) {
					continue
				}
				if _, dup := matched[available]; dup {
					continue
				}
				matched[available] = struct{}{}
				columnsFiltered = append(columnsFiltered, available)
			}
		}
	}

	limit := strconv.Itoa(s.Settings.Limit)
	from := getFromClause(s, includedFolders)
	if len(columnsFiltered) == 0 {
		return fmt.Sprintf("SELECT * FROM %s LIMIT %s", from, limit)
	}

	where := getWhereClause(queryText, columnsFiltered, exactMatch)
	if where == "" {
		// A blank query yields no conditions; emitting "WHERE  LIMIT n" would
		// be invalid SQL, so report the problem explicitly instead.
		return "error: empty search query"
	}
	return fmt.Sprintf("SELECT * FROM %s WHERE %s LIMIT %s", from, where, limit)
}
// getWhereClause builds the body of a SQL WHERE clause that matches queryText
// against the given columns using case-insensitive ILIKE comparisons.
//
// queryText is split on whitespace into terms. Every term must match (AND) in
// at least one of the columns (OR). A term may be anchored: a leading "^"
// requires the value to start with the term, a trailing "$" requires it to end
// with the term, and both together require the whole value to equal the term.
// When exactMatch is true every term is compared against the whole value.
//
// The term is embedded as a literal: single quotes are doubled, and the LIKE
// metacharacters "%" and "_" as well as the escape character "\" itself are
// backslash-escaped. Column names are emitted as quoted identifiers with
// embedded double quotes doubled.
//
// It returns an empty string when queryText contains no terms.
func getWhereClause(queryText string, columns []string, exactMatch bool) string {
	// The backslash must be escaped too: it is the declared ESCAPE character,
	// so a raw "\" in the term would otherwise swallow the following character.
	likeEscaper := strings.NewReplacer(`\`, `\\`, `_`, `\_`, `%`, `\%`)

	quotedColumns := make([]string, len(columns))
	for i, col := range columns {
		quotedColumns[i] = `"` + strings.ReplaceAll(col, `"`, `""`) + `"`
	}

	terms := strings.Fields(queryText)
	andClauses := make([]string, 0, len(terms))
	for _, term := range terms {
		startsWith := strings.HasPrefix(term, "^")
		term = strings.TrimPrefix(term, "^")
		endsWith := strings.HasSuffix(term, "$")
		term = strings.TrimSuffix(term, "$")

		literal := strings.ToLower(likeEscaper.Replace(term))
		literal = strings.ReplaceAll(literal, "'", "''")

		// The pattern depends only on the term, so build it once per term.
		var pattern string
		switch {
		case exactMatch || (startsWith && endsWith):
			pattern = literal
		case startsWith:
			pattern = literal + "%"
		case endsWith:
			pattern = "%" + literal
		default:
			pattern = "%" + literal + "%"
		}

		orClauses := make([]string, 0, len(quotedColumns))
		for _, col := range quotedColumns {
			orClauses = append(orClauses, fmt.Sprintf("%s ILIKE '%s' ESCAPE '\\'", col, pattern))
		}
		andClauses = append(andClauses, "("+strings.Join(orClauses, " OR ")+")")
	}
	return strings.Join(andClauses, " AND ")
}
// getFromClause builds the read_parquet(...) table expression covering every
// loaded dataleak whose path lies inside one of includedFolders.
//
// The files are read with union_by_name=true and filename=true, so the result
// set contains a "filename" column identifying the originating parquet file.
func getFromClause(s *server.Server, includedFolders []string) string {
	parquets := []string{}
	for _, dataleak := range *s.Dataleaks {
		if !isPathInFolders(dataleak.Path, includedFolders) {
			continue
		}
		// Paths are embedded as SQL string literals: double any single quote
		// so a file name such as "o'brien.parquet" cannot break the statement.
		quoted := "'" + strings.ReplaceAll(dataleak.Path, "'", "''") + "'"
		parquets = append(parquets, quoted)
	}
	return fmt.Sprintf("read_parquet([%s], union_by_name=true, filename=true)", strings.Join(parquets, ", "))
}
// GetDataleakSample returns a small preview of the parquet file at path.
//
// The first element of the result is the header (column names). It is followed
// by the rows numbered 1-3 and the last three rows according to row_number();
// files with fewer than seven rows are returned in full. NULL values are
// rendered as empty strings.
//
// On failure the error is returned together with whatever was collected up to
// that point.
func GetDataleakSample(s server.Server, path string) ([][]string, error) {
	rowsData := [][]string{}

	// The path is embedded as a SQL string literal: double any single quote so
	// it cannot terminate the literal early.
	sqlPath := strings.ReplaceAll(path, "'", "''")

	// Use row_number() to get first 3 and last 3 rows
	query := fmt.Sprintf(`
WITH numbered AS (
SELECT *, row_number() OVER () AS rn, count(*) OVER () AS total_rows
FROM read_parquet('%s')
)
SELECT * EXCLUDE (rn, total_rows)
FROM numbered
WHERE rn <= 3 OR rn > total_rows - 3
`, sqlPath)

	rows, err := s.Duckdb.Query(query)
	if err != nil {
		return rowsData, err
	}
	defer rows.Close()

	cols, err := rows.Columns()
	if err != nil {
		return rowsData, err
	}
	rowsData = append(rowsData, cols) // header

	// Every column is scanned as raw bytes; a nil slice marks a NULL value.
	rawResult := make([][]byte, len(cols))
	dest := make([]any, len(cols))
	for i := range rawResult {
		dest[i] = &rawResult[i]
	}

	for rows.Next() {
		if err := rows.Scan(dest...); err != nil {
			return rowsData, err
		}
		row := make([]string, len(cols))
		for i, cell := range rawResult {
			// A nil cell converts to "", which is the NULL representation.
			row[i] = string(cell)
		}
		rowsData = append(rowsData, row)
	}
	if err = rows.Err(); err != nil {
		return rowsData, err
	}
	return rowsData, nil
}
// isPathInFolders reports whether path is one of the given folders or lies
// somewhere beneath one of them.
//
// The check is lexical (based on filepath.Rel); symlinks are not resolved.
// Folders for which no relative path can be computed (for example when mixing
// absolute and relative paths) are skipped.
func isPathInFolders(path string, folders []string) bool {
	parentPrefix := ".." + string(filepath.Separator)
	for _, folder := range folders {
		rel, err := filepath.Rel(folder, path)
		if err != nil {
			continue
		}
		// path escapes folder only when the relative path is ".." itself or
		// starts with a ".." component. A plain prefix test would also reject
		// legitimate names that merely begin with two dots ("..hidden").
		if rel != ".." && !strings.HasPrefix(rel, parentPrefix) {
			return true
		}
	}
	return false
}