init
This commit is contained in:
173
leak-utils/misc/csv.go
Normal file
173
leak-utils/misc/csv.go
Normal file
@@ -0,0 +1,173 @@
|
||||
package misc
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/anotherhadi/eleakxir/leak-utils/settings"
|
||||
"github.com/charmbracelet/log"
|
||||
)
|
||||
|
||||
func CsvToParquet(lu settings.LeakUtils, inputFile string, outputFile string, strict bool) error {
|
||||
hasHeader, err := csvHasHeader(inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
header := "true"
|
||||
if !hasHeader {
|
||||
header = "false"
|
||||
}
|
||||
strictMode := "true"
|
||||
if !strict {
|
||||
strictMode = "false"
|
||||
}
|
||||
|
||||
delimiter := getDelimiter(inputFile)
|
||||
|
||||
query := fmt.Sprintf(`CREATE TABLE my_table AS FROM read_csv_auto('%s', HEADER=%s, delim='%s', ignore_errors=true, all_varchar=true, null_padding=true, strict_mode=%s);
|
||||
COPY my_table TO '%s' (FORMAT 'parquet', COMPRESSION '%s', ROW_GROUP_SIZE 200_000);`,
|
||||
inputFile, header, delimiter, strictMode, outputFile, lu.Compression)
|
||||
|
||||
if lu.Debug {
|
||||
log.Info("Detected delimiter", "delimiter", delimiter)
|
||||
log.Info("CSV header detection", "hasHeader", hasHeader)
|
||||
log.Info("Executing query", "query", query)
|
||||
}
|
||||
|
||||
_, err = lu.Db.Exec(query)
|
||||
|
||||
if lu.Debug {
|
||||
log.Info("Finished executing query")
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func getDelimiter(inputFile string) string {
|
||||
lines, err := getNLine(inputFile, 10, 0)
|
||||
if err != nil {
|
||||
log.Warn("Failed to read CSV file to determine delimiter, defaulting to comma", "error", err)
|
||||
return ","
|
||||
}
|
||||
|
||||
delimiterCounts := map[string]int{
|
||||
",": 0,
|
||||
";": 0,
|
||||
"\t": 0,
|
||||
"|": 0,
|
||||
":": 0,
|
||||
}
|
||||
|
||||
for _, line := range lines {
|
||||
for d := range delimiterCounts {
|
||||
delimiterCounts[d] += strings.Count(line, d)
|
||||
}
|
||||
}
|
||||
|
||||
maxCount := 0
|
||||
delimiter := ","
|
||||
|
||||
for d, count := range delimiterCounts {
|
||||
if count > maxCount {
|
||||
maxCount = count
|
||||
delimiter = d
|
||||
}
|
||||
}
|
||||
|
||||
return delimiter
|
||||
}
|
||||
|
||||
func csvHasHeader(inputFile string) (hasHeader bool, err error) {
|
||||
firstRow, err := getFirstRowCsv(inputFile)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
for i, col := range firstRow {
|
||||
col = strings.ReplaceAll(col, "\"", "")
|
||||
col = strings.ReplaceAll(col, " ", "")
|
||||
col = strings.ReplaceAll(col, "-", "")
|
||||
col = strings.ReplaceAll(col, "_", "")
|
||||
col = strings.ReplaceAll(col, ".", "")
|
||||
firstRow[i] = strings.ToLower(strings.TrimSpace(col))
|
||||
}
|
||||
knownHeaders := []string{"email", "password", "username", "phone", "lastname", "firstname"}
|
||||
for _, knownHeader := range knownHeaders {
|
||||
if slices.Contains(firstRow, knownHeader) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// getNLine returns up to n lines of inputFile, skipping the first offset lines.
//
// A non-positive n yields (nil, nil) without touching the file, and a negative
// offset is treated as 0. Fewer than n lines are returned when the file ends
// early. Failures to open or scan the file are returned with nil lines.
func getNLine(inputFile string, n, offset int) (lines []string, err error) {
	if n <= 0 {
		return nil, nil
	}

	skip := offset
	if skip < 0 {
		skip = 0
	}

	src, err := os.Open(inputFile)
	if err != nil {
		return nil, err
	}
	defer src.Close()

	sc := bufio.NewScanner(src)
	// lineNo is the 1-based number of the line sc currently holds.
	for lineNo := 1; sc.Scan(); lineNo++ {
		if lineNo <= skip {
			continue
		}
		lines = append(lines, sc.Text())
		if len(lines) >= n {
			break
		}
	}

	switch scanErr := sc.Err(); {
	case scanErr == nil, scanErr == io.EOF:
		return lines, nil
	default:
		return nil, scanErr
	}
}
|
||||
|
||||
func getFirstRowCsv(inputFile string) (row []string, err error) {
|
||||
rows, err := getFirstNRowsCsv(inputFile, 1)
|
||||
if len(rows) == 0 {
|
||||
return nil, fmt.Errorf("no rows found in CSV")
|
||||
}
|
||||
return rows[0], err
|
||||
}
|
||||
|
||||
// getFirstNRowsCsv parses and returns up to n records from the start of the
// CSV file inputFile, using an encoding/csv reader with its default settings.
//
// Fewer than n records are returned, without error, when the file ends early.
// A failure to open the file or to parse a record yields nil rows and an error
// wrapping the cause.
func getFirstNRowsCsv(inputFile string, n int) (rows [][]string, err error) {
	f, err := os.Open(inputFile)
	if err != nil {
		return nil, fmt.Errorf("failed to open file: %w", err)
	}
	defer f.Close()

	reader := csv.NewReader(f)

	for i := 0; i < n; i++ {
		row, err := reader.Read()
		// Reader.Read signals end of input with the io.EOF sentinel; compare
		// against the sentinel itself rather than against the error's text.
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("failed to read CSV: %w", err)
		}
		rows = append(rows, row)
	}

	return rows, nil
}
|
||||
31
leak-utils/misc/misc.go
Normal file
31
leak-utils/misc/misc.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package misc
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/anotherhadi/eleakxir/leak-utils/settings"
|
||||
)
|
||||
|
||||
func MergeFiles(lu settings.LeakUtils, outputFile string, inputFiles ...string) error {
|
||||
out, err := os.Create(outputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
for _, inputFile := range inputFiles {
|
||||
file, err := os.Open(inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
_, err = io.Copy(out, file)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
67
leak-utils/misc/ulp.go
Normal file
67
leak-utils/misc/ulp.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package misc
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/anotherhadi/eleakxir/leak-utils/settings"
|
||||
)
|
||||
|
||||
func RemoveUrlSchemeFromUlp(lu settings.LeakUtils, inputFile string) error {
|
||||
file, err := os.Open(inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
outputFile := inputFile + ".clean"
|
||||
out, err := os.Create(outputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
reader := bufio.NewReader(file)
|
||||
writer := bufio.NewWriter(out)
|
||||
|
||||
for {
|
||||
line, err := reader.ReadString('\n')
|
||||
if err != nil && err != io.EOF {
|
||||
return err
|
||||
}
|
||||
|
||||
firstColumn := strings.Index(line, ":")
|
||||
firstScheme := strings.Index(line, "://")
|
||||
if firstScheme != -1 && firstColumn == firstScheme {
|
||||
line = line[firstScheme+3:]
|
||||
}
|
||||
|
||||
_, werr := writer.WriteString(line)
|
||||
if werr != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
err = writer.Flush()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = os.Remove(inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = os.Rename(outputFile, inputFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user