init
This commit is contained in:
107
leak-utils/parquet/format.go
Normal file
107
leak-utils/parquet/format.go
Normal file
@@ -0,0 +1,107 @@
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/anotherhadi/eleakxir/leak-utils/settings"
|
||||
)
|
||||
|
||||
// If there is no full_name but there is last_name and first_name, create full_name
|
||||
// If there is no full_name, no last_name or no first_name, but there is name, rename name to full_name
|
||||
func addFullname(operations []ColumnOperation) []ColumnOperation {
|
||||
hasFullName := false
|
||||
hasFirstName := false
|
||||
hasLastName := false
|
||||
hasName := false
|
||||
for _, op := range operations {
|
||||
if op.Action != "drop" {
|
||||
if op.NewName == "full_name" {
|
||||
hasFullName = true
|
||||
} else if op.NewName == "first_name" {
|
||||
hasFirstName = true
|
||||
} else if op.NewName == "last_name" {
|
||||
hasLastName = true
|
||||
} else if op.NewName == "name" {
|
||||
hasName = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if hasFullName {
|
||||
return operations
|
||||
}
|
||||
if hasFirstName && hasLastName {
|
||||
operations = append(operations, ColumnOperation{
|
||||
OriginalName: "first_name || ' ' || last_name",
|
||||
NewName: "full_name",
|
||||
Action: "rename",
|
||||
})
|
||||
fmt.Println(settings.Muted.Render("\nAdding new column 'full_name' as concatenation of 'first_name' and 'last_name'."))
|
||||
return operations
|
||||
}
|
||||
if hasName {
|
||||
for i, op := range operations {
|
||||
if op.NewName == "name" && op.Action != "drop" {
|
||||
operations[i].NewName = "full_name"
|
||||
fmt.Println(settings.Muted.Render("\nRenaming column 'name' to 'full_name'."))
|
||||
return operations
|
||||
}
|
||||
}
|
||||
}
|
||||
if hasFirstName {
|
||||
operations = append(operations, ColumnOperation{
|
||||
OriginalName: "first_name",
|
||||
NewName: "full_name",
|
||||
Action: "rename",
|
||||
})
|
||||
fmt.Println(settings.Muted.Render("\nAdding new column 'full_name' from 'first_name'."))
|
||||
return operations
|
||||
}
|
||||
if hasLastName {
|
||||
operations = append(operations, ColumnOperation{
|
||||
OriginalName: "last_name",
|
||||
NewName: "full_name",
|
||||
Action: "rename",
|
||||
})
|
||||
fmt.Println(settings.Muted.Render("\nAdding new column 'full_name' from 'last_name'."))
|
||||
return operations
|
||||
}
|
||||
|
||||
return operations
|
||||
}
|
||||
|
||||
// formatColumnName normalises a column name so it can be used as a plain
// SQL identifier.
//
// The name is lower-cased, runs of whitespace and every '-' become '_',
// single and double quotes are removed, and any remaining character outside
// [a-z0-9_] is discarded. All leading and trailing underscores are then
// stripped (not just one on each side), so inputs such as "--e-mail--" give
// "e_mail" rather than "_e_mail_".
//
// The result may be empty when the input contains no usable character.
func formatColumnName(columnName string) string {
	// Fields ignores leading/trailing whitespace and collapses inner runs,
	// so no separate TrimSpace or space replacement is required.
	columnName = strings.Join(strings.Fields(strings.ToLower(columnName)), "_")
	columnName = strings.ReplaceAll(columnName, "\"", "")
	columnName = strings.ReplaceAll(columnName, "'", "")
	columnName = strings.ReplaceAll(columnName, "-", "_")

	// Only keep a-z, 0-9 and _
	var formatted strings.Builder
	formatted.Grow(len(columnName))
	for _, r := range columnName {
		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' {
			formatted.WriteRune(r)
		}
	}

	return strings.Trim(formatted.String(), "_")
}
|
||||
|
||||
// formatColumns applies specific formatting rules to column operations.
|
||||
func formatColumns(operations []ColumnOperation) []ColumnOperation {
|
||||
formatedOperations := []ColumnOperation{}
|
||||
for _, op := range operations {
|
||||
if op.NewName == "phone" || strings.HasSuffix(op.NewName, "_phone") {
|
||||
op.OriginalName = "REGEXP_REPLACE(" + op.OriginalName + ", '[^0-9]', '')"
|
||||
} else if op.NewName == "email" || strings.HasSuffix(op.NewName, "_email") {
|
||||
op.OriginalName = "REGEXP_REPLACE(LOWER(TRIM(" + op.OriginalName + ")), '[^a-z0-9._@-]', '')"
|
||||
}
|
||||
formatedOperations = append(formatedOperations, op)
|
||||
}
|
||||
return formatedOperations
|
||||
}
|
||||
276
leak-utils/parquet/parquet.go
Normal file
276
leak-utils/parquet/parquet.go
Normal file
@@ -0,0 +1,276 @@
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/anotherhadi/eleakxir/leak-utils/settings"
|
||||
"github.com/charmbracelet/log"
|
||||
)
|
||||
|
||||
// Parquet describes a parquet file: where it lives plus the summary data
// (columns, sample rows, row count) gathered by GetParquet. When used as an
// output description only Filepath and Compression are filled in.
type Parquet struct {
	// Filepath is the path handed to read_parquet / COPY TO; Filename is the
	// part of Filepath after the last '/'.
	Filepath, Filename string

	// Columns holds the column names in file order.
	Columns []string
	// Sample holds the first rows of the file, one []string per row.
	Sample [][]string
	// NRows is the total number of rows in the file.
	NRows int64

	// Compression of the output file (e.g., "SNAPPY", "ZSTD", "NONE" or "")
	Compression string
}
|
||||
|
||||
// ColumnOperation records what should happen to one column when a parquet
// file is rewritten.
type ColumnOperation struct {
	// OriginalName is the SQL expression selected from the input (a quoted
	// column name or a derived expression); NewName is the alias it is
	// written under in the output.
	OriginalName, NewName string

	// Action is one of "keep", "rename", "drop".
	Action string
}
|
||||
|
||||
func (parquet Parquet) PrintParquet() {
|
||||
fmt.Println(settings.Header.Render(parquet.Filename) + "\n")
|
||||
fmt.Println(settings.Accent.Render("File path:"), settings.Base.Render(parquet.Filepath))
|
||||
fmt.Println(settings.Accent.Render("Number of columns:"), settings.Base.Render(fmt.Sprintf("%d", len(parquet.Columns))))
|
||||
fmt.Println(settings.Accent.Render("Number of rows:"), settings.Base.Render(formatWithSpaces(parquet.NRows)))
|
||||
fmt.Println()
|
||||
fmt.Println(settings.Accent.Render(strings.Join(parquet.Columns, " | ")))
|
||||
for _, row := range parquet.Sample {
|
||||
fmt.Println(settings.Base.Render(strings.Join(row, " | ")))
|
||||
}
|
||||
}
|
||||
|
||||
func InfoParquet(lu settings.LeakUtils, inputFile string) error {
|
||||
parquet, err := GetParquet(lu.Db, inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
parquet.PrintParquet()
|
||||
return nil
|
||||
}
|
||||
|
||||
func CleanParquet(lu settings.LeakUtils, inputFile, outputFile string, skipLineFormating, deleteFirstRow, printQuery bool) error {
|
||||
input, err := GetParquet(lu.Db, inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
input.PrintParquet()
|
||||
columnOps := configureColumns(*input, skipLineFormating)
|
||||
output := Parquet{
|
||||
Filepath: outputFile,
|
||||
Compression: lu.Compression,
|
||||
}
|
||||
err = transformParquet(lu, *input, output, columnOps, deleteFirstRow, printQuery)
|
||||
return err
|
||||
}
|
||||
|
||||
func configureColumns(input Parquet, skipLineFormating bool) []ColumnOperation {
|
||||
reader := bufio.NewReader(os.Stdin)
|
||||
var operations []ColumnOperation
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println(settings.Base.Render("For each column, choose an action:"))
|
||||
fmt.Println(settings.Base.Render(" [k] Keep"))
|
||||
fmt.Println(settings.Base.Render(" [r] Rename"))
|
||||
fmt.Println(settings.Base.Render(" [d] Drop/Delete"))
|
||||
fmt.Println(settings.Base.Render(" [s] Suggested"))
|
||||
fmt.Println(settings.Base.Render(" [b] Go back"))
|
||||
fmt.Println()
|
||||
|
||||
for i := 0; i < len(input.Columns); i++ {
|
||||
col := input.Columns[i]
|
||||
suggestion := getSuggestion(col)
|
||||
|
||||
for {
|
||||
fmt.Println(settings.Muted.Render("\nColumn:"), settings.Accent.Render(col))
|
||||
if suggestion != "" {
|
||||
fmt.Println(settings.Alert.Render("Suggested action: Rename to '" + suggestion + "'"))
|
||||
}
|
||||
fmt.Print(settings.Base.Render("[k/r/d/s/b]: "))
|
||||
|
||||
input, err := reader.ReadString('\n')
|
||||
if err != nil {
|
||||
log.Printf("Error reading input: %v", err)
|
||||
continue
|
||||
}
|
||||
input = strings.TrimSpace(strings.ToLower(input))
|
||||
|
||||
op := ColumnOperation{
|
||||
OriginalName: col,
|
||||
NewName: col,
|
||||
Action: "keep",
|
||||
}
|
||||
|
||||
switch input {
|
||||
case "b", "back":
|
||||
if i > 0 {
|
||||
i -= 2
|
||||
if len(operations) > 0 {
|
||||
operations = operations[:len(operations)-1]
|
||||
}
|
||||
fmt.Println(settings.Muted.Render("Going back to the previous column..."))
|
||||
} else {
|
||||
fmt.Println(settings.Muted.Render("Already at the first column, cannot go back further."))
|
||||
continue
|
||||
}
|
||||
goto nextColumn
|
||||
|
||||
case "r", "rename":
|
||||
fmt.Print(settings.Base.Render("Enter new name: "))
|
||||
newName, err := reader.ReadString('\n')
|
||||
if err != nil {
|
||||
log.Printf("Error reading new name: %v", err)
|
||||
continue
|
||||
}
|
||||
newName = strings.TrimSpace(newName)
|
||||
if newName != "" {
|
||||
op.OriginalName = "\"" + op.OriginalName + "\""
|
||||
op.NewName = formatColumnName(newName)
|
||||
op.Action = "rename"
|
||||
operations = append(operations, op)
|
||||
goto nextColumn
|
||||
} else {
|
||||
fmt.Println(settings.Muted.Render("Invalid name, please try again."))
|
||||
continue
|
||||
}
|
||||
|
||||
case "s", "suggested":
|
||||
if suggestion != "" {
|
||||
op.OriginalName = "\"" + op.OriginalName + "\""
|
||||
op.NewName = formatColumnName(suggestion)
|
||||
op.Action = "rename"
|
||||
} else {
|
||||
fmt.Println(settings.Muted.Render("No valid suggestion available"))
|
||||
continue
|
||||
}
|
||||
operations = append(operations, op)
|
||||
goto nextColumn
|
||||
|
||||
case "d", "drop", "delete":
|
||||
op.Action = "drop"
|
||||
operations = append(operations, op)
|
||||
goto nextColumn
|
||||
|
||||
case "k", "keep", "":
|
||||
op.OriginalName = "\"" + op.OriginalName + "\""
|
||||
op.NewName = formatColumnName(op.NewName)
|
||||
op.Action = "rename"
|
||||
operations = append(operations, op)
|
||||
goto nextColumn
|
||||
|
||||
default:
|
||||
fmt.Println(settings.Muted.Render("Invalid choice, please enter [k/r/d/s/b]."))
|
||||
continue
|
||||
}
|
||||
}
|
||||
nextColumn:
|
||||
lastOp := operations[len(operations)-1]
|
||||
switch lastOp.Action {
|
||||
case "rename":
|
||||
if formatColumnName(lastOp.OriginalName) == lastOp.NewName {
|
||||
fmt.Printf(settings.Muted.Render("Keeping column '%s' as is.\n"), lastOp.OriginalName)
|
||||
} else {
|
||||
fmt.Printf(settings.Muted.Render("Renaming column '%s' to '%s'.\n"), lastOp.OriginalName, lastOp.NewName)
|
||||
}
|
||||
case "drop":
|
||||
fmt.Printf(settings.Muted.Render("Dropping column '%s'.\n"), lastOp.OriginalName)
|
||||
}
|
||||
}
|
||||
if !skipLineFormating {
|
||||
operations = formatColumns(operations)
|
||||
}
|
||||
operations = addFullname(operations)
|
||||
|
||||
return operations
|
||||
}
|
||||
|
||||
func transformParquet(lu settings.LeakUtils, input, output Parquet, operations []ColumnOperation, deleteFirstRow, printQuery bool) error {
|
||||
var selectClauses []string
|
||||
hasColumns := false
|
||||
|
||||
for _, op := range operations {
|
||||
if op.Action != "drop" {
|
||||
hasColumns = true
|
||||
if op.Action == "rename" {
|
||||
selectClauses = append(selectClauses, fmt.Sprintf("%s AS \"%s\"", op.OriginalName, op.NewName))
|
||||
} else {
|
||||
selectClauses = append(selectClauses, op.OriginalName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !hasColumns {
|
||||
return fmt.Errorf("no columns selected for output")
|
||||
}
|
||||
|
||||
selectClause := strings.Join(selectClauses, ", ")
|
||||
compression := ""
|
||||
if output.Compression != "" {
|
||||
compression = ", COMPRESSION '" + output.Compression + "'"
|
||||
}
|
||||
|
||||
columnsLength := []string{}
|
||||
for _, col := range input.Columns {
|
||||
columnsLength = append(columnsLength, "COALESCE(LENGTH(\""+col+"\"),0)")
|
||||
}
|
||||
allowedRowSize := 30 * len(input.Columns)
|
||||
offset := ""
|
||||
if deleteFirstRow {
|
||||
offset = "OFFSET 1"
|
||||
}
|
||||
|
||||
query := fmt.Sprintf(`
|
||||
COPY (
|
||||
SELECT %s
|
||||
FROM read_parquet('%s')
|
||||
WHERE (%s) < %d
|
||||
%s
|
||||
) TO '%s' (FORMAT PARQUET, ROW_GROUP_SIZE 200_000 %s)
|
||||
`, selectClause, input.Filepath, strings.Join(columnsLength, "+"), allowedRowSize, offset, output.Filepath, compression)
|
||||
|
||||
if printQuery {
|
||||
fmt.Println("Query:", query) // TODO: Remove tabs
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Println(settings.Base.Render("\nTransforming and writing to output parquet..."))
|
||||
_, err := lu.Db.Exec(query)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to execute transformation: %w", err)
|
||||
}
|
||||
fmt.Println(settings.Base.Render("Transformation complete!\n"))
|
||||
|
||||
newParquet, err := GetParquet(lu.Db, output.Filepath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
newParquet.PrintParquet()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetParquet(db *sql.DB, inputFile string) (parquet *Parquet, err error) {
|
||||
parquet = &Parquet{}
|
||||
parquet.Filepath = inputFile
|
||||
|
||||
parquet.Columns, err = getColumns(db, inputFile)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
parquet.NRows, err = countRows(db, inputFile)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
parquet.Sample, err = getFirstNRows(db, inputFile, 6)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
n := strings.LastIndex(inputFile, "/")
|
||||
if n == -1 {
|
||||
parquet.Filename = inputFile
|
||||
} else {
|
||||
parquet.Filename = inputFile[n+1:]
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
81
leak-utils/parquet/suggestions.go
Normal file
81
leak-utils/parquet/suggestions.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"slices"
|
||||
)
|
||||
|
||||
func getSuggestion(col string) string {
|
||||
col = formatColumnName(col)
|
||||
knownNames := []string{
|
||||
"date",
|
||||
"phone",
|
||||
"username",
|
||||
"address",
|
||||
"email",
|
||||
"postal_code",
|
||||
"city",
|
||||
"country",
|
||||
"state",
|
||||
"age",
|
||||
"gender",
|
||||
"password",
|
||||
"password_hash",
|
||||
"full_name",
|
||||
"last_name",
|
||||
"name", // Will be renamed to full_name later
|
||||
"first_name",
|
||||
"birth_date",
|
||||
"url",
|
||||
"ip",
|
||||
}
|
||||
if slices.Contains(knownNames, col) {
|
||||
return col
|
||||
}
|
||||
if col == "user" {
|
||||
return "username"
|
||||
}
|
||||
if col == "login" {
|
||||
return "username"
|
||||
}
|
||||
if col == "sex" {
|
||||
return "gender"
|
||||
}
|
||||
if col == "ip_address" {
|
||||
return "ip"
|
||||
}
|
||||
if col == "password_hashed" {
|
||||
return "password_hash"
|
||||
}
|
||||
if col == "firstname" {
|
||||
return "first_name"
|
||||
}
|
||||
if col == "lastname" {
|
||||
return "last_name"
|
||||
}
|
||||
if col == "fullname" {
|
||||
return "full_name"
|
||||
}
|
||||
if col == "mail" {
|
||||
return "email"
|
||||
}
|
||||
if col == "zip" || col == "postalcode" || col == "zipcode" || col == "postal" || col == "zip_code" {
|
||||
return "postal_code"
|
||||
}
|
||||
if col == "street_address" {
|
||||
return "address"
|
||||
}
|
||||
if col == "hash" || col == "hashed_password" || col == "hash_password" {
|
||||
return "password_hash"
|
||||
}
|
||||
if col == "birthdate" || col == "dob" || col == "date_of_birth" {
|
||||
return "birth_date"
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// HINTS:
|
||||
// date: _date
|
||||
// url: _url, link
|
||||
// address: _address
|
||||
//
|
||||
105
leak-utils/parquet/utils.go
Normal file
105
leak-utils/parquet/utils.go
Normal file
@@ -0,0 +1,105 @@
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// getColumns retrieves the column names from the Parquet file.
//
// It (re)creates the view "parquet_view" over the file and reads the first
// field of every row returned by DESCRIBE on that view. The path is placed
// in a SQL string literal, so single quotes in it are doubled. An error that
// interrupts the row iteration is reported instead of silently returning a
// truncated column list.
func getColumns(db *sql.DB, filepath string) ([]string, error) {
	// Create a view from the parquet file
	query := fmt.Sprintf(
		"CREATE OR REPLACE VIEW parquet_view AS SELECT * FROM read_parquet('%s')",
		strings.ReplaceAll(filepath, "'", "''"),
	)
	if _, err := db.Exec(query); err != nil {
		return nil, fmt.Errorf("failed to create view: %w", err)
	}

	// Get column information
	rows, err := db.Query("DESCRIBE parquet_view")
	if err != nil {
		return nil, fmt.Errorf("failed to describe view: %w", err)
	}
	defer rows.Close()

	var columns []string
	for rows.Next() {
		// DESCRIBE yields six fields per column; only the name is used.
		var colName, colType, nullable, key, defaultVal, extra sql.NullString
		if err := rows.Scan(&colName, &colType, &nullable, &key, &defaultVal, &extra); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}
		if colName.Valid {
			columns = append(columns, colName.String)
		}
	}
	// Next returns false both at the end of the data and on failure.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to read columns: %w", err)
	}

	return columns, nil
}
|
||||
|
||||
// getFirstNRows retrieves the first N rows from the Parquet file.
//
// Every value is read as text; SQL NULL is rendered as the string "NULL".
// The path is placed in a SQL string literal, so single quotes in it are
// doubled. An error that interrupts the row iteration is reported instead of
// silently returning a truncated sample.
func getFirstNRows(db *sql.DB, inputFile string, n int) ([][]string, error) {
	query := fmt.Sprintf(
		"SELECT * FROM read_parquet('%s') LIMIT %d",
		strings.ReplaceAll(inputFile, "'", "''"), n,
	)
	rows, err := db.Query(query)
	if err != nil {
		return nil, fmt.Errorf("failed to query parquet file: %w", err)
	}
	defer rows.Close()

	cols, err := rows.Columns()
	if err != nil {
		return nil, fmt.Errorf("failed to get columns: %w", err)
	}

	var results [][]string
	for rows.Next() {
		values := make([]sql.NullString, len(cols))
		valuePtrs := make([]any, len(cols))
		for i := range values {
			valuePtrs[i] = &values[i]
		}

		if err := rows.Scan(valuePtrs...); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}

		row := make([]string, len(values))
		for i, val := range values {
			if val.Valid {
				row[i] = val.String
			} else {
				row[i] = "NULL"
			}
		}
		results = append(results, row)
	}
	// Next returns false both at the end of the data and on failure.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to read rows: %w", err)
	}

	return results, nil
}
|
||||
|
||||
// countRows counts the number of rows in the Parquet file.
//
// The path is placed in a SQL string literal, so single quotes in it are
// doubled to keep the statement well-formed.
func countRows(db *sql.DB, inputFile string) (int64, error) {
	query := fmt.Sprintf(
		"SELECT COUNT(*) FROM read_parquet('%s')",
		strings.ReplaceAll(inputFile, "'", "''"),
	)

	var count int64
	if err := db.QueryRow(query).Scan(&count); err != nil {
		return 0, fmt.Errorf("failed to count rows: %w", err)
	}
	return count, nil
}
|
||||
|
||||
// formatWithSpaces formats an integer with spaces as thousand separators,
// e.g. 1234567 becomes "1 234 567".
//
// A leading minus sign is kept attached to the first digit group, so -123
// gives "-123" and -1234 gives "-1 234".
func formatWithSpaces(n int64) string {
	digits := strconv.FormatInt(n, 10)

	// Split off the sign so it does not count as a digit when grouping.
	sign := ""
	if n < 0 {
		sign, digits = "-", digits[1:]
	}

	var b strings.Builder
	b.Grow(len(sign) + len(digits) + (len(digits)-1)/3)
	b.WriteString(sign)
	for i := 0; i < len(digits); i++ {
		// A separator goes before every digit that starts a full group of
		// three counted from the right, except at the very beginning.
		if i != 0 && (len(digits)-i)%3 == 0 {
			b.WriteByte(' ')
		}
		b.WriteByte(digits[i])
	}
	return b.String()
}
|
||||
Reference in New Issue
Block a user