Oracle EGDB: Find junk values in all tables and columns (and log them in a table)

249
2
Jump to solution
12-08-2022 12:02 AM
Bud
by
Regular Contributor III
Oracle 18c/10.7.1 geodatabase:

I want to find junk values in all tables and columns (in a specific schema/owner/user):

  1. Loop through all tables
  2. Loop through each number and text column
  3. If a column has junk values, then add a record to a log table that summarizes the column's data integrity issue (i.e., insert a rolled-up record into the log table with a COUNT column, not a row for each problem value):
    • 0 (number)
    • " " (text; single space)
    • "  " (text; double space)
    • "0" (text)
    • "-" (text)
    • "NULL" (text; not a true null)
    • "<NULL>" (text; not a true null)

Bud_2-1670486244692.png

Thanks.

0 Kudos
1 Solution

Accepted Solutions
JohannesLindner
MVP Regular Contributor

 

def check_for_junk_values(layer_or_table):
    fields = [f.name for f in  arcpy.ListFields(layer_or_table)]
    junk_values = [0, " ", "  ", "-", "0", "NULL", "<NULL>", "<Null>"]
    data = [dict(zip(fields, row)) for row in arcpy.da.SearchCursor(layer_or_table, fields)]  # [{"Field1": 0, "Field2": 1}, {"Field1": 5, "Field2": 2}]
    counts = dict()
    for row in data:
        for field, value in row.items():
            if value in junk_values:
                key = (field, value)
                try:
                    counts[key] += 1
                except KeyError:
                    counts[key] = 1
    return counts



tables = ["TestPoints", "TestLines", "TestPolygons", "TestTable"]
# you can also use paths to your tables, or list them automatically with arcpy.ListTables() / arcpy.ListFeatureclasses()

log_table = arcpy.management.CreateTable("memory", "JunkLogTable")
arcpy.management.AddField(log_table, "TABLE_NAME", "TEXT")
arcpy.management.AddField(log_table, "COLUMN_NAME", "TEXT")
arcpy.management.AddField(log_table, "VAL_COUNT", "SHORT")
arcpy.management.AddField(log_table, "VALUE", "TEXT")

with arcpy.da.InsertCursor(log_table, ["TABLE_NAME", "COLUMN_NAME", "VALUE", "VAL_COUNT"]) as cursor:
    for table in tables:
        result = check_for_junk_values(table)
        for key, count in result.items():
            col = key[0]
            val = key[1].__repr__()  # __repr__() to get the quotes
            cursor.insertRow([table, col, val, count])

 

 

JohannesLindner_0-1670488554407.png

 


Have a great day!
Johannes

View solution in original post

2 Replies
Bud
by
Regular Contributor III

It's possible to do this with PL/SQL.

--DROP TABLE incorrect_value_results;
--CREATE TABLE incorrect_value_results (id NUMBER, table_name VARCHAR2(30), column_name VARCHAR2(30), val_count NUMBER, value varchar2(30));
TRUNCATE TABLE incorrect_value_results;

DECLARE
   l_count NUMBER;
   l_inv_char_str VARCHAR2(2000);
   TYPE invalid_char_val_rec IS RECORD(
      cnt      NUMBER,
      inv_char VARCHAR2(20)
   );
   TYPE invalid_char_val_tab IS TABLE OF invalid_char_val_rec
      INDEX BY PLS_INTEGER;
   l_inv_char_vals invalid_char_val_tab;
   l_index NUMBER;
BEGIN
   l_index := 0;
   -- Loop through each table in the schema
   FOR i IN (SELECT table_name
               FROM user_tables) 
   LOOP
      -- Loop through each relevant column for this table
      -- Exclude column EVT_FROM
      FOR j IN (SELECT column_name, data_type
                  FROM user_tab_cols
                 WHERE table_name = i.table_name
                   AND column_name  NOT IN ('EVT_FROM','ANGLE','UND','OFFSET')
                   AND table_name   NOT LIKE 'A_%'
                   AND table_name   NOT LIKE 'SDE%'
                   AND table_name   NOT LIKE '%ANNO%'
                   AND table_name   NOT IN ('INCORRECT_VALUE_RESULTS','NUMBERS')
                   AND data_type    IN ('VARCHAR2', 'CHAR', 'NCHAR', 'NVARCHAR2', 'NUMBER')) 
      LOOP
         IF j.data_type IN ('VARCHAR2', 'CHAR', 'NCHAR', 'NVARCHAR2') THEN
            EXECUTE IMMEDIATE 
               'SELECT COUNT(1), '||j.column_name||
                ' FROM '||i.table_name|| 
                ' WHERE UPPER('||j.column_name||') IN('' '', ''  '', ''0'', ''-'', ''NULL'', ''<NULL>'' )
                  GROUP BY '||j.column_name
                 BULK COLLECT INTO l_inv_char_vals;
            
         ELSIF j.data_type = 'NUMBER' THEN
            EXECUTE IMMEDIATE 
               'SELECT COUNT(1), '||j.column_name||
                ' FROM '||i.table_name|| 
                ' WHERE UPPER('||j.column_name||') <= 0
                  GROUP BY '||j.column_name
                 BULK COLLECT INTO l_inv_char_vals;
         END IF;     
         -- If there are results then log them
         l_index := l_index + 1;
         FORALL k IN 1..l_inv_char_vals.COUNT
            INSERT INTO incorrect_value_results (id, table_name, column_name, val_count, value)
            VALUES (l_index,
                    i.table_name, 
                    j.column_name, 
                    l_inv_char_vals(k).cnt,
                    l_inv_char_vals(k).inv_char);
      END LOOP;
   END LOOP;
END;
/
COMMIT;

Pros:

  • The PL/SQL script's performance is very good. It scans millions of records in less than 10 seconds. And it might be possible to optimize it even further.

Cons: 

  • The code is somewhat lengthy. It might be simpler to do it with ArcPy.

I don't need the solution to work for all geodatabase types; I just need to use it on my Oracle enterprise geodatabase. But an ArcPy solution would be more beneficial to a wider audience.

0 Kudos
JohannesLindner
MVP Regular Contributor

 

def check_for_junk_values(layer_or_table):
    fields = [f.name for f in  arcpy.ListFields(layer_or_table)]
    junk_values = [0, " ", "  ", "-", "0", "NULL", "<NULL>", "<Null>"]
    data = [dict(zip(fields, row)) for row in arcpy.da.SearchCursor(layer_or_table, fields)]  # [{"Field1": 0, "Field2": 1}, {"Field1": 5, "Field2": 2}]
    counts = dict()
    for row in data:
        for field, value in row.items():
            if value in junk_values:
                key = (field, value)
                try:
                    counts[key] += 1
                except KeyError:
                    counts[key] = 1
    return counts



tables = ["TestPoints", "TestLines", "TestPolygons", "TestTable"]
# you can also use paths to your tables, or list them automatically with arcpy.ListTables() / arcpy.ListFeatureclasses()

log_table = arcpy.management.CreateTable("memory", "JunkLogTable")
arcpy.management.AddField(log_table, "TABLE_NAME", "TEXT")
arcpy.management.AddField(log_table, "COLUMN_NAME", "TEXT")
arcpy.management.AddField(log_table, "VAL_COUNT", "SHORT")
arcpy.management.AddField(log_table, "VALUE", "TEXT")

with arcpy.da.InsertCursor(log_table, ["TABLE_NAME", "COLUMN_NAME", "VALUE", "VAL_COUNT"]) as cursor:
    for table in tables:
        result = check_for_junk_values(table)
        for key, count in result.items():
            col = key[0]
            val = key[1].__repr__()  # __repr__() to get the quotes
            cursor.insertRow([table, col, val, count])

 

 

JohannesLindner_0-1670488554407.png

 


Have a great day!
Johannes