Utilizing Python to Verify Data Sources Throughout a Migration Process

neomapper · ‎04-17-2025

Hello,

I'd like to share a script I created for our gas UPDM data migration as we transition to the new version. This script checks and validates the differences between the mapped field data values and generates an Excel file to highlight any discrepancies in case the data doesn't transfer correct.

Hope this helps!

Jordan

import arcpy
import pandas as pd

# Set input paths and parameters
layer2 = r" Your_data_path"
layer1 = r" Pilot_data_path"
unique_field = "assetid"  # Replace with the name of the unique field
output_excel = r"Your_folder_path/assembly_output.xlsx" #output path
matched_fields = {
    "COMMODITY":"your_field",
    "CONTINTOM":"your_field",
    "COVERTYPE":"your_field",
    "CPTRACEABILITY":"your_field",
    "CRITICALINDICATOR":"your_field",
    "DEPTH":"your_field",
    "DEPTHUNITS":"your_field",
    "DESIGNCLASS":"your_field",
    "DESIGNFACTOR":"your_field",
    "DESIGNPRESSURESTANDARD":"your_field",
    "DESIGNPRESSUREUNITS":"your_field",
    "DIAMETERUNITS":"your_field",
    "HEATNUMBER":"your_field",
    "INSERVICEDATE":"your_field",
    "INSTALLATIONMETHOD":"your_field",
    "INSTALLDATE":"your_field",
    "LEGACYPRESSURENETWORK":"your_field",
    "LENGTHSOURCE":"your_field",
    "LENGTHUNITS":"your_field",
    "LIFECYCLESTATUS":"your_field",
    "LOCATIONDESCRIPTION":"your_field",
    "MAINTBY":"your_field",
    "MANUFACTUREDATE":"your_field",
    "MANUFACTURER":"your_field",
    "MANUFACTURERLOTNO":"your_field",
    "MAOPDESIGN":"your_field",
    "MAOPRECORD":"your_field",
    "MAOPTEST":"your_field",
    "MAOPUNITS":"your_field",
    "MATERIAL":"your_field",
    "MEASUREDLENGTH":"your_field",
    "NOMINALDIAMETER":"your_field",
    "NOTES":"your_field",
    "ODORIZED":"your_field",
    "ORIGINALCOVERDEPTH":"your_field",
    "OWNEDBY":"your_field",
    "PERCENTSMYS":"your_field",
    "PINCHABLE":"your_field",
    "PRESSUREUNITS":"your_field",
    "PRIMARYCOATINGTYPE":"your_field",
    "PROJECTNUMBER":"your_field",
    "RETIREDDATE":"your_field",
    "SEAMTYPE":"your_field",
    "TEST_DATE":"your_field",
    "TEST_MEDIUM":"your_field",
    "TEST_PRESSURE":"your_field",
    "TRACERWIRE":"your_field",
#    "USR_GRADE":your_field,
    "USR_SDR":"your_field",
    "USR_STOCK_NUMBER":"your_field",
    "WARNINGTAPE":"your_field",
    "WORKORDERID":"your_field",
    "CLDRIVE":"your_field",
    "CLNEARST":"your_field",
    "INSTALLBY":"your_field",
    "TEST_PRESSURE_DURATION":"your_field",
    "FDR":"your_field",
    "WELDERNAME":"your_field"

}

# Create dictionaries to store field values
layer1_data = {}
layer2_data = {}

# Get unique values from the unique field
with arcpy.da.SearchCursor(layer1, [unique_field]) as cursor1:
    for row in cursor1:
        unique_value = row[0]
        if unique_value not in layer1_data:
            layer1_data[unique_value] = {}

with arcpy.da.SearchCursor(layer2, [unique_field]) as cursor2:
    for row in cursor2:
        unique_value = row[0]
        if unique_value not in layer2_data:
            layer2_data[unique_value] = {}

# Check if any rows in layer 1 have unique values also in layer 2
for unique_value in layer1_data:
    if unique_value in layer2_data:
        # Get the values for each mapped field
        with arcpy.da.SearchCursor(layer1, [unique_field] + list(matched_fields.keys())) as cursor1:
            for row1 in cursor1:
                if row1[0] == unique_value:
                    # Store values in layer1_data
                    for i, field in enumerate(matched_fields.keys()):
                        layer1_data[unique_value][field] = row1[i+1] # +1 because ObjectID is at index 0

        with arcpy.da.SearchCursor(layer2, [unique_field] + list(matched_fields.values())) as cursor2:
            for row2 in cursor2:
                if row2[0] == unique_value:
                    # Store values in layer2_data
                    for i, field in enumerate(matched_fields.values()):
                        layer2_data[unique_value][field] = row2[i+1] # +1 because ObjectID is at index 0

# Compare the values for each mapped field
mismatch_data = []
for unique_value in layer1_data:
    if unique_value in layer2_data:
        mismatch = False
        for layer1_field, layer2_field in matched_fields.items():
            if layer1_data[unique_value][layer1_field] != layer2_data[unique_value][layer2_field]:
                mismatch = True
                break  # Exit the loop if a mismatch is found

        if mismatch:
            row_data = {
                "unique_value": unique_value
            }

            # Add all field values for the matched unique_value
            for layer1_field, layer2_field in matched_fields.items():
                row_data[f"layer1_{layer1_field}"] = layer1_data[unique_value][layer1_field]
                row_data[f"layer2_{layer2_field}"] = layer2_data[unique_value][layer2_field]
            mismatch_data.append(row_data)

# Create a pandas DataFrame from the mismatch data
df = pd.DataFrame(mismatch_data)

# Write the DataFrame to an Excel file
df.to_excel(output_excel, index=False)
print(f"Mismatches written to: {output_excel}")

AyanPalit · ‎04-30-2025

@neomapper Thanks for sharing your script; Utility community members will find this useful. What's the UPDM version used for the script?

Ayan Palit | Principal Consultant Esri