This commit is contained in:
Hadi
2025-09-24 17:20:03 +02:00
commit b9fbed9a54
83 changed files with 6241 additions and 0 deletions

View File

@@ -0,0 +1,107 @@
package parquet
import (
"fmt"
"strings"
"github.com/anotherhadi/eleakxir/leak-utils/settings"
)
// addFullname makes sure the column set exposes a "full_name" column whenever
// one can be derived from the operations that are not dropped.
//
// Rules, in priority order:
//  1. a "full_name" column already exists: nothing changes;
//  2. both "first_name" and "last_name" exist: a "full_name" operation whose
//     source expression is first_name || ' ' || last_name is appended;
//  3. a "name" column exists: the first such operation is renamed to
//     "full_name" in place;
//  4. only "first_name" exists: "full_name" is appended with it as source;
//  5. only "last_name" exists: "full_name" is appended with it as source.
//
// A short notice is printed for every change. The returned slice is either
// the input slice or the input slice with one operation appended.
func addFullname(operations []ColumnOperation) []ColumnOperation {
	var fullName, firstName, lastName, name bool
	for _, op := range operations {
		if op.Action == "drop" {
			continue
		}
		switch op.NewName {
		case "full_name":
			fullName = true
		case "first_name":
			firstName = true
		case "last_name":
			lastName = true
		case "name":
			name = true
		}
	}

	// derive appends a "full_name" operation built from the given source
	// expression and prints the accompanying notice.
	derive := func(expression, notice string) []ColumnOperation {
		extended := append(operations, ColumnOperation{
			OriginalName: expression,
			NewName:      "full_name",
			Action:       "rename",
		})
		fmt.Println(settings.Muted.Render(notice))
		return extended
	}

	if fullName {
		return operations
	}
	if firstName && lastName {
		return derive(
			"first_name || ' ' || last_name",
			"\nAdding new column 'full_name' as concatenation of 'first_name' and 'last_name'.",
		)
	}
	if name {
		for i := range operations {
			if operations[i].Action == "drop" || operations[i].NewName != "name" {
				continue
			}
			operations[i].NewName = "full_name"
			fmt.Println(settings.Muted.Render("\nRenaming column 'name' to 'full_name'."))
			return operations
		}
	}
	if firstName {
		return derive("first_name", "\nAdding new column 'full_name' from 'first_name'.")
	}
	if lastName {
		return derive("last_name", "\nAdding new column 'full_name' from 'last_name'.")
	}
	return operations
}
// formatColumnName normalizes a raw column name into a lowercase snake_case
// identifier made only of the characters a-z, 0-9 and '_'.
//
// The name is lowercased, runs of whitespace are collapsed into a single '_',
// every '-' becomes '_', and any other character (quotes, punctuation,
// non-ASCII letters, ...) is removed. All leading and trailing underscores are
// then stripped, so "--postal code--" yields "postal_code".
//
// The result is empty when the input contains no ASCII letter or digit.
func formatColumnName(columnName string) string {
	// Fields also discards leading/trailing whitespace, so no separate trim
	// is required before joining.
	joined := strings.Join(strings.Fields(strings.ToLower(columnName)), "_")

	var formatted strings.Builder
	formatted.Grow(len(joined))
	for _, r := range joined {
		switch {
		case r == '-':
			formatted.WriteRune('_')
		case r == '_', r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			formatted.WriteRune(r)
		}
	}

	// Trim (rather than TrimPrefix/TrimSuffix) removes every surrounding
	// underscore, not just one on each side.
	return strings.Trim(formatted.String(), "_")
}
// formatColumns wraps the source expression of phone and email columns in a
// SQL normalization expression and returns the operations as a new slice.
//
//   - Columns named "phone" or ending in "_phone" keep digits only.
//   - Columns named "email" or ending in "_email" are trimmed, lowercased and
//     stripped of every character outside a-z, 0-9, '.', '_', '@' and '-'.
//
// All other operations are copied unchanged. The input slice is not modified.
func formatColumns(operations []ColumnOperation) []ColumnOperation {
	formatted := make([]ColumnOperation, 0, len(operations))
	for _, op := range operations {
		switch {
		case op.NewName == "phone" || strings.HasSuffix(op.NewName, "_phone"):
			// The 'g' option requests a global replace; without it only the
			// first non-digit character would be removed.
			op.OriginalName = "REGEXP_REPLACE(" + op.OriginalName + ", '[^0-9]', '', 'g')"
		case op.NewName == "email" || strings.HasSuffix(op.NewName, "_email"):
			op.OriginalName = "REGEXP_REPLACE(LOWER(TRIM(" + op.OriginalName + ")), '[^a-z0-9._@-]', '', 'g')"
		}
		formatted = append(formatted, op)
	}
	return formatted
}

View File

@@ -0,0 +1,276 @@
package parquet
import (
"bufio"
"database/sql"
"fmt"
"os"
"strings"
"github.com/anotherhadi/eleakxir/leak-utils/settings"
"github.com/charmbracelet/log"
)
// Parquet describes a Parquet file: where it lives, its column names, a small
// sample of its rows and its row count. GetParquet fills in every field except
// Compression. When the value describes the output of a transformation, only
// Filepath and Compression are set by CleanParquet.
type Parquet struct {
// Path of the file, as given by the caller.
Filepath string
// Part of Filepath after the last "/" (the whole path when there is none).
Filename string
// Column names of the file.
Columns []string
// First rows of the file, one string per cell; NULL cells are stored as "NULL".
Sample [][]string
// Total number of rows in the file.
NRows int64
Compression string // Compression of the output file (e.g., "SNAPPY", "ZSTD", "NONE" or "")
}
// ColumnOperation describes what happens to one column when the output file
// is written by transformParquet.
type ColumnOperation struct {
// Source of the column. It is inserted verbatim into the SELECT list, so it
// is either a quoted column name or a SQL expression.
OriginalName string
// Name of the column in the output; used as the alias for "rename".
NewName string
// "drop" leaves the column out, "rename" selects OriginalName AS NewName,
// and any other value selects OriginalName as is.
Action string // "keep", "rename", "drop"
}
// PrintParquet writes a summary of the file to standard output: its name,
// its path, the number of columns and rows, then the column names followed by
// the sample rows, with cells separated by " | ".
func (p Parquet) PrintParquet() {
	const separator = " | "

	columnCount := fmt.Sprintf("%d", len(p.Columns))
	rowCount := formatWithSpaces(p.NRows)

	fmt.Println(settings.Header.Render(p.Filename) + "\n")
	fmt.Println(settings.Accent.Render("File path:"), settings.Base.Render(p.Filepath))
	fmt.Println(settings.Accent.Render("Number of columns:"), settings.Base.Render(columnCount))
	fmt.Println(settings.Accent.Render("Number of rows:"), settings.Base.Render(rowCount))
	fmt.Println()

	header := strings.Join(p.Columns, separator)
	fmt.Println(settings.Accent.Render(header))
	for i := range p.Sample {
		line := strings.Join(p.Sample[i], separator)
		fmt.Println(settings.Base.Render(line))
	}
}
// InfoParquet loads the description of inputFile through lu.Db and prints it.
// It returns the error reported by GetParquet, if any.
func InfoParquet(lu settings.LeakUtils, inputFile string) error {
	info, err := GetParquet(lu.Db, inputFile)
	if err == nil {
		info.PrintParquet()
	}
	return err
}
// CleanParquet runs the interactive cleaning workflow: it prints a summary of
// inputFile, asks what to do with each column, then writes the result to
// outputFile using lu.Compression.
//
// skipLineFormating disables the phone/email normalization, deleteFirstRow
// and printQuery are forwarded to transformParquet. Any error from reading the
// input or from the transformation is returned.
func CleanParquet(lu settings.LeakUtils, inputFile, outputFile string, skipLineFormating, deleteFirstRow, printQuery bool) error {
	source, err := GetParquet(lu.Db, inputFile)
	if err != nil {
		return err
	}
	source.PrintParquet()

	operations := configureColumns(*source, skipLineFormating)
	target := Parquet{Filepath: outputFile, Compression: lu.Compression}
	return transformParquet(lu, *source, target, operations, deleteFirstRow, printQuery)
}
// configureColumns interactively asks, on standard input, what to do with
// each column of input and returns the resulting list of operations.
//
// For every column the user can keep it, rename it, drop it, accept the
// suggested name from getSuggestion, or go back to the previous column (which
// discards the operation recorded for it). Kept and renamed columns are
// recorded as "rename" operations whose source is the quoted column name and
// whose target is a formatted name; a name that formats to the empty string
// is rejected.
//
// Unless skipLineFormating is set, the operations are passed through
// formatColumns. addFullname is always applied last.
func configureColumns(input Parquet, skipLineFormating bool) []ColumnOperation {
	reader := bufio.NewReader(os.Stdin)
	var operations []ColumnOperation

	fmt.Println()
	fmt.Println(settings.Base.Render("For each column, choose an action:"))
	fmt.Println(settings.Base.Render(" [k] Keep"))
	fmt.Println(settings.Base.Render(" [r] Rename"))
	fmt.Println(settings.Base.Render(" [d] Drop/Delete"))
	fmt.Println(settings.Base.Render(" [s] Suggested"))
	fmt.Println(settings.Base.Render(" [b] Go back"))
	fmt.Println()

	// The index is advanced manually: exactly one operation is recorded per
	// answered column, and going back removes one and steps the index back.
columns:
	for i := 0; i < len(input.Columns); {
		col := input.Columns[i]
		suggestion := getSuggestion(col)
		// Double any embedded double quote so the name stays a single,
		// valid quoted identifier.
		quoted := "\"" + strings.ReplaceAll(col, "\"", "\"\"") + "\""

		var op ColumnOperation
	prompt:
		for {
			fmt.Println(settings.Muted.Render("\nColumn:"), settings.Accent.Render(col))
			if suggestion != "" {
				fmt.Println(settings.Alert.Render("Suggested action: Rename to '" + suggestion + "'"))
			}
			fmt.Print(settings.Base.Render("[k/r/d/s/b]: "))

			answer, err := reader.ReadString('\n')
			if err != nil {
				// NOTE(review): a persistent read error (e.g. standard input
				// closed) makes this retry forever; the signature offers no
				// way to report it to the caller.
				log.Printf("Error reading input: %v", err)
				continue
			}

			switch strings.TrimSpace(strings.ToLower(answer)) {
			case "b", "back":
				if i == 0 {
					fmt.Println(settings.Muted.Render("Already at the first column, cannot go back further."))
					continue
				}
				if len(operations) > 0 {
					operations = operations[:len(operations)-1]
				}
				i--
				fmt.Println(settings.Muted.Render("Going back to the previous column..."))
				// Nothing was recorded for this column, so skip the status
				// message and prompt for the previous column again.
				continue columns
			case "r", "rename":
				fmt.Print(settings.Base.Render("Enter new name: "))
				newName, err := reader.ReadString('\n')
				if err != nil {
					log.Printf("Error reading new name: %v", err)
					continue
				}
				formatted := formatColumnName(newName)
				if formatted == "" {
					fmt.Println(settings.Muted.Render("Invalid name, please try again."))
					continue
				}
				op = ColumnOperation{OriginalName: quoted, NewName: formatted, Action: "rename"}
				break prompt
			case "s", "suggested":
				if suggestion == "" {
					fmt.Println(settings.Muted.Render("No valid suggestion available"))
					continue
				}
				op = ColumnOperation{OriginalName: quoted, NewName: formatColumnName(suggestion), Action: "rename"}
				break prompt
			case "d", "drop", "delete":
				op = ColumnOperation{OriginalName: col, NewName: col, Action: "drop"}
				break prompt
			case "k", "keep", "":
				formatted := formatColumnName(col)
				if formatted == "" {
					fmt.Println(settings.Muted.Render("Column name has no valid characters, please rename or drop it."))
					continue
				}
				op = ColumnOperation{OriginalName: quoted, NewName: formatted, Action: "rename"}
				break prompt
			default:
				fmt.Println(settings.Muted.Render("Invalid choice, please enter [k/r/d/s/b]."))
				continue
			}
		}

		operations = append(operations, op)
		switch {
		case op.Action == "drop":
			fmt.Println(settings.Muted.Render(fmt.Sprintf("Dropping column '%s'.", col)))
		case formatColumnName(col) == op.NewName:
			fmt.Println(settings.Muted.Render(fmt.Sprintf("Keeping column '%s' as is.", col)))
		default:
			fmt.Println(settings.Muted.Render(fmt.Sprintf("Renaming column '%s' to '%s'.", col, op.NewName)))
		}
		i++
	}

	if !skipLineFormating {
		operations = formatColumns(operations)
	}
	return addFullname(operations)
}
// transformParquet builds and runs the COPY query that writes the selected
// columns of input to output.Filepath as a Parquet file.
//
// Every operation that is not a "drop" contributes one SELECT item: "rename"
// operations are aliased to their NewName, any other action selects
// OriginalName as is. Rows whose summed column lengths reach 30 times the
// number of input columns are left out. When deleteFirstRow is set, OFFSET 1
// is added to the query. output.Compression, when not empty, is passed as the
// COMPRESSION option.
//
// When printQuery is set the query is printed and nothing is executed.
// Otherwise the query is run through lu.Db and a summary of the new file is
// printed. An error is returned when no column is selected, when the query
// fails, or when the new file cannot be read back.
func transformParquet(lu settings.LeakUtils, input, output Parquet, operations []ColumnOperation, deleteFirstRow, printQuery bool) error {
	// sqlString escapes a value for use inside a single-quoted SQL literal,
	// so paths containing an apostrophe do not break the query.
	sqlString := func(s string) string { return strings.ReplaceAll(s, "'", "''") }

	selectClauses := make([]string, 0, len(operations))
	for _, op := range operations {
		switch op.Action {
		case "drop":
			// Dropped columns are simply not selected.
		case "rename":
			selectClauses = append(selectClauses, fmt.Sprintf("%s AS \"%s\"", op.OriginalName, op.NewName))
		default:
			selectClauses = append(selectClauses, op.OriginalName)
		}
	}
	if len(selectClauses) == 0 {
		return fmt.Errorf("no columns selected for output")
	}
	selectClause := strings.Join(selectClauses, ", ")

	compression := ""
	if output.Compression != "" {
		compression = ", COMPRESSION '" + sqlString(output.Compression) + "'"
	}

	columnsLength := make([]string, 0, len(input.Columns))
	for _, col := range input.Columns {
		// Embedded double quotes are doubled to keep the identifier valid.
		ident := "\"" + strings.ReplaceAll(col, "\"", "\"\"") + "\""
		columnsLength = append(columnsLength, "COALESCE(LENGTH("+ident+"),0)")
	}
	allowedRowSize := 30 * len(input.Columns)

	offset := ""
	if deleteFirstRow {
		offset = "OFFSET 1"
	}

	query := fmt.Sprintf(`
COPY (
SELECT %s
FROM read_parquet('%s')
WHERE (%s) < %d
%s
) TO '%s' (FORMAT PARQUET, ROW_GROUP_SIZE 200_000 %s)
`, selectClause, sqlString(input.Filepath), strings.Join(columnsLength, "+"), allowedRowSize, offset, sqlString(output.Filepath), compression)

	if printQuery {
		fmt.Println("Query:", query)
		return nil
	}

	fmt.Println(settings.Base.Render("\nTransforming and writing to output parquet..."))
	if _, err := lu.Db.Exec(query); err != nil {
		return fmt.Errorf("failed to execute transformation: %w", err)
	}
	fmt.Println(settings.Base.Render("Transformation complete!\n"))

	newParquet, err := GetParquet(lu.Db, output.Filepath)
	if err != nil {
		return err
	}
	newParquet.PrintParquet()
	return nil
}
// GetParquet collects the description of the Parquet file at inputFile using
// db: its column names, its row count, its first 6 rows and its file name.
//
// The steps run in that order and the first failure stops the process; the
// value filled in so far is returned together with the error.
func GetParquet(db *sql.DB, inputFile string) (parquet *Parquet, err error) {
	parquet = &Parquet{Filepath: inputFile}

	if parquet.Columns, err = getColumns(db, inputFile); err != nil {
		return parquet, err
	}
	if parquet.NRows, err = countRows(db, inputFile); err != nil {
		return parquet, err
	}
	if parquet.Sample, err = getFirstNRows(db, inputFile, 6); err != nil {
		return parquet, err
	}

	// LastIndex yields -1 when the path has no "/", which makes the slice
	// start at 0 and keep the whole path as the file name.
	parquet.Filename = inputFile[strings.LastIndex(inputFile, "/")+1:]
	return parquet, nil
}

View File

@@ -0,0 +1,81 @@
package parquet
import (
"slices"
)
// getSuggestion proposes a canonical column name for col, or returns "" when
// it has none.
//
// The name is first normalized with formatColumnName. A normalized name that
// is already canonical is returned as is; otherwise a fixed table of common
// aliases (e.g. "login", "zip", "dob") is consulted.
func getSuggestion(col string) string {
	normalized := formatColumnName(col)

	canonical := []string{
		"date",
		"phone",
		"username",
		"address",
		"email",
		"postal_code",
		"city",
		"country",
		"state",
		"age",
		"gender",
		"password",
		"password_hash",
		"full_name",
		"last_name",
		"name", // Will be renamed to full_name later
		"first_name",
		"birth_date",
		"url",
		"ip",
	}
	if slices.Contains(canonical, normalized) {
		return normalized
	}

	switch normalized {
	case "user", "login":
		return "username"
	case "sex":
		return "gender"
	case "ip_address":
		return "ip"
	case "password_hashed", "hash", "hashed_password", "hash_password":
		return "password_hash"
	case "firstname":
		return "first_name"
	case "lastname":
		return "last_name"
	case "fullname":
		return "full_name"
	case "mail":
		return "email"
	case "zip", "postalcode", "zipcode", "postal", "zip_code":
		return "postal_code"
	case "street_address":
		return "address"
	case "birthdate", "dob", "date_of_birth":
		return "birth_date"
	default:
		return ""
	}
}
// HINTS:
// date: _date
// url: _url, link
// address: _address
//

105
leak-utils/parquet/utils.go Normal file
View File

@@ -0,0 +1,105 @@
package parquet
import (
"database/sql"
"fmt"
"strconv"
"strings"
)
// getColumns retrieves the column names from the Parquet file.
//
// It (re)creates the view parquet_view over the file, runs DESCRIBE on it and
// returns the first field of every described row, in order. An error is
// returned when the view cannot be created or described, when a row cannot be
// scanned, or when iterating over the rows fails.
func getColumns(db *sql.DB, filepath string) ([]string, error) {
	// Double single quotes so a path containing an apostrophe stays inside
	// the SQL string literal.
	escapedPath := strings.ReplaceAll(filepath, "'", "''")

	// Create a view from the parquet file
	query := fmt.Sprintf("CREATE OR REPLACE VIEW parquet_view AS SELECT * FROM read_parquet('%s')", escapedPath)
	if _, err := db.Exec(query); err != nil {
		return nil, fmt.Errorf("failed to create view: %w", err)
	}

	// Get column information
	rows, err := db.Query("DESCRIBE parquet_view")
	if err != nil {
		return nil, fmt.Errorf("failed to describe view: %w", err)
	}
	defer rows.Close()

	var columns []string
	for rows.Next() {
		// Each described row is scanned into six nullable fields; only the
		// column name is kept.
		var colName, colType, nullable, key, defaultVal, extra sql.NullString
		if err := rows.Scan(&colName, &colType, &nullable, &key, &defaultVal, &extra); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}
		if colName.Valid {
			columns = append(columns, colName.String)
		}
	}
	// Next returning false may hide an iteration error; surface it.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate rows: %w", err)
	}
	return columns, nil
}
// getFirstNRows retrieves the first N rows from the Parquet file.
//
// Every cell is returned as a string; NULL cells are returned as the literal
// text "NULL". An error is returned when the query fails, when a row cannot
// be scanned, or when iterating over the rows fails.
func getFirstNRows(db *sql.DB, inputFile string, n int) ([][]string, error) {
	// Double single quotes so a path containing an apostrophe stays inside
	// the SQL string literal.
	escapedPath := strings.ReplaceAll(inputFile, "'", "''")

	query := fmt.Sprintf("SELECT * FROM read_parquet('%s') LIMIT %d", escapedPath, n)
	rows, err := db.Query(query)
	if err != nil {
		return nil, fmt.Errorf("failed to query parquet file: %w", err)
	}
	defer rows.Close()

	cols, err := rows.Columns()
	if err != nil {
		return nil, fmt.Errorf("failed to get columns: %w", err)
	}

	var results [][]string
	for rows.Next() {
		// Scan needs one destination pointer per column.
		values := make([]sql.NullString, len(cols))
		valuePtrs := make([]any, len(cols))
		for i := range values {
			valuePtrs[i] = &values[i]
		}
		if err := rows.Scan(valuePtrs...); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}

		row := make([]string, 0, len(values))
		for _, val := range values {
			if val.Valid {
				row = append(row, val.String)
			} else {
				row = append(row, "NULL")
			}
		}
		results = append(results, row)
	}
	// Next returning false may hide an iteration error; surface it.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate rows: %w", err)
	}
	return results, nil
}
// countRows counts the number of rows in the Parquet file.
//
// It returns 0 and a wrapped error when the count query fails.
func countRows(db *sql.DB, inputFile string) (int64, error) {
	// Double single quotes so a path containing an apostrophe stays inside
	// the SQL string literal.
	escapedPath := strings.ReplaceAll(inputFile, "'", "''")

	var count int64
	query := fmt.Sprintf("SELECT COUNT(*) FROM read_parquet('%s')", escapedPath)
	if err := db.QueryRow(query).Scan(&count); err != nil {
		return 0, fmt.Errorf("failed to count rows: %w", err)
	}
	return count, nil
}
// formatWithSpaces formats an integer with spaces as thousand separators,
// e.g. 1234567 becomes "1 234 567" and -1234 becomes "-1 234".
func formatWithSpaces(n int64) string {
	digits := strconv.FormatInt(n, 10)

	// Split off the sign so it is not counted as a digit when grouping;
	// working on the text also keeps the smallest int64 safe.
	sign := ""
	if n < 0 {
		sign, digits = "-", digits[1:]
	}

	var b strings.Builder
	b.Grow(len(sign) + len(digits) + (len(digits)-1)/3)
	b.WriteString(sign)
	for i := 0; i < len(digits); i++ {
		// A separator goes before every digit that starts a group of three,
		// counted from the right.
		if i != 0 && (len(digits)-i)%3 == 0 {
			b.WriteByte(' ')
		}
		b.WriteByte(digits[i])
	}
	return b.String()
}