Python search all folders for aprx apart from those folders containing sub-string

Username1 · ‎01-24-2025

Hi All,

I'm new to python and have created the code below after much googling. I'm trying to search through all subfolders within my directory and find all aprx's then list all of the features held within the aprx. My code achieves this but I'd like to make it more efficient by skipping over sub-folders that contain a certain sub-string.

For example I'd like to be able to remove all folders containing the string "myfolder" if the directory contained the following folders:

myfolder_1

myfolder2

testfolder

So my goal would be to only search for aprx's in the folder "testfolder".

My Code:

# Extension to search for. The trailing comma makes this a real one-element
# tuple, so more extensions can be added later without changing the check.
extensions = ('.aprx',)
# Directory names (exact, case-sensitive matches only) to exclude
exclude_directories = set(['Archived','#Archived','Archive','archived','archive','#archived','#archive','ss','SS'])
for dname, dirs, files in os.walk(r'C:\Users\MyDirectory'):
    # Assigning to dirs[:] edits the list in place, which is what stops
    # os.walk from descending into the excluded directories.
    dirs[:] = [d for d in dirs if d not in exclude_directories]
    for fname in files:
        if fname.lower().endswith(extensions):  # check for extension
            fpath = os.path.join(dname, fname)  # full path to the project file
            aprx = arcpy.mp.ArcGISProject(fpath)
            for m in aprx.listMaps():
                for lyr in m.listLayers():
                    if lyr.isBroken:
                        print("(BROKEN) " + lyr.name + " - " + m.name)
                    else:
                        if lyr.supports("DATASOURCE"):
                            # [211:] drops the first 211 characters of the data source string
                            # (presumably a fixed prefix in the poster's environment - confirm)
                            print ("Map Layer Name: " + lyr.name + "| SZ_gis_dataname: " + lyr.dataSource[211:])

Marshal · ‎01-24-2025

Looks like you were essentially there. This should satisfy your goal, along with some minor refactoring. Hope it helps!

import os
import arcpy

# Extension to search. The trailing comma makes this a one-element tuple
# (str.endswith accepts a tuple of suffixes), so more can be added later.
extensions = ('.aprx',)

# Directory names to exclude (exact, case-sensitive match)
exclude_directories = {'Archived', '#Archived', 'Archive', 'archived', 'archive', '#archived', '#archive', 'ss', 'SS'}

# Any directory whose name contains this text (compared in lower case) is skipped
exclude_substring = 'myfolder'

# Walk through the directory
for dname, dirs, files in os.walk(r'C:\Users\MyDirectory'):
    # Assigning to dirs[:] prunes the walk in place, so excluded
    # directories are never entered at all.
    dirs[:] = [
        d for d in dirs
        if d not in exclude_directories and exclude_substring not in d.lower()
    ]
    for fname in files:
        if not fname.lower().endswith(extensions):  # Only .aprx files are of interest
            continue
        fpath = os.path.join(dname, fname)  # Full path to the .aprx file
        try:
            aprx = arcpy.mp.ArcGISProject(fpath)  # Open the ArcGIS Project
            for m in aprx.listMaps():  # Get aprx maps
                for lyr in m.listLayers():  # Get map layers
                    if lyr.isBroken:  # Check if layer is broken
                        print(f"(BROKEN) {lyr.name} - {m.name}")
                    elif lyr.supports("DATASOURCE"):
                        print(f"Map Layer Name: {lyr.name} | SZ_gis_dataname: {lyr.dataSource}")
        except Exception as e:
            # Report the failing project and carry on with the remaining ones
            print(f"Error processing file {fpath}: {e}")

RPGIS · ‎01-24-2025

Hi @Username1 ,

For future reference, please post your code by clicking the expand-toolbar button

and then inserting it with the code sample option.

Regarding your code, try this.

import os

import arcpy

# Directory names (exact, case-sensitive matches only) to exclude
exclude_directories = ['Archived','#Archived','Archive','archived','archive','#archived','#archive','ss','SS']
# Folders and files whose name contains this text are skipped as well
RemoveFilesByName = 'myfolderex'

WorkingFolder = r'C:\Users\MyDirectory'

# Step 1: collect the full path of every .aprx file outside the excluded folders
Projects = []
for root, dirnames, filenames in os.walk(WorkingFolder):
    # Assigning to dirnames[:] prunes the walk in place, so os.walk never
    # descends into an excluded folder.
    dirnames[:] = [
        d for d in dirnames
        if d not in exclude_directories and RemoveFilesByName not in d
    ]
    for filename in filenames:
        # Test the suffix rather than "'.aprx' in filename", which would also
        # match names that merely contain ".aprx" somewhere in the middle.
        if filename.lower().endswith('.aprx') and RemoveFilesByName not in filename:
            Projects.append(os.path.join(root, filename))

# Step 2: report every broken layer and the data source of every other layer
for project_path in Projects:
    project = arcpy.mp.ArcGISProject(project_path)
    for m in project.listMaps():
        for lyr in m.listLayers():
            if lyr.isBroken:
                print(f'(BROKEN) {lyr.name} - {m.name}')
            elif lyr.supports("DATASOURCE"):
                # [211:] drops the first 211 characters of the data source string,
                # as in the original poster's script
                print(f'Map Layer Name: {lyr.name} | SZ_gis_dataname: {lyr.dataSource[211:]}')

TonyAlmeida · ‎01-24-2025

I use something like this.

import os
import arcpy

# Suffixes that identify a project file (tuple form, as str.endswith expects)
extensions = (".aprx",)

# A folder is skipped when its name contains any of these substrings
exclude_substrings = ["Folder"]

def list_aprx_files(directory, suffixes=None, skip_substrings=None):
    """Return the full paths of the project files found under *directory*.

    The folder tree is walked recursively. Sub-folders whose name contains
    one of the excluded substrings are pruned, so nothing below them is
    visited.

    Args:
        directory: Root folder to search.
        suffixes: File-name suffix, or iterable of suffixes, to collect.
            Matching ignores case. Defaults to the module-level
            ``extensions``.
        skip_substrings: Substring, or iterable of substrings; a sub-folder
            whose name contains any of them is not entered. Matching is
            case-sensitive. Defaults to the module-level
            ``exclude_substrings``.

    Returns:
        A list of path strings, one per matching file.
    """
    if suffixes is None:
        suffixes = extensions
    if isinstance(suffixes, str):
        # A bare string would otherwise be iterated character by character
        suffixes = (suffixes,)
    # File names are lower-cased before comparison, so the suffixes must be too
    suffixes = tuple(suffix.lower() for suffix in suffixes)

    if skip_substrings is None:
        skip_substrings = exclude_substrings
    elif isinstance(skip_substrings, str):
        skip_substrings = (skip_substrings,)

    aprx_files = []
    for root, dirs, files in os.walk(directory):
        # Assigning to dirs[:] edits the list in place, which is what tells
        # os.walk not to descend into the removed folders.
        dirs[:] = [d for d in dirs if not any(sub in d for sub in skip_substrings)]
        aprx_files.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(suffixes)
        )
    return aprx_files

def process_layer(layer, map_name):
    """Print a one-line report for a single layer.

    A broken layer is reported as such. Otherwise the data source is printed
    when the layer supports the "DATASOURCE" property, and a note is printed
    when it does not.

    Args:
        layer: Layer object exposing ``name``, ``isBroken``, ``supports()``
            and ``dataSource``.
        map_name: Name of the map the layer belongs to.
    """
    if layer.isBroken:
        print(f"Broken Layer: {layer.name} | Map: {map_name}")
        return

    if not layer.supports("DATASOURCE"):
        print(f"Layer: {layer.name} doesn't support data sources.")
        return

    # Reading dataSource can still fail, so report the failure instead of aborting
    try:
        print(f"Layer: {layer.name} | Map: {map_name} | Data Source: {layer.dataSource}")
    except Exception as err:
        print(f"Couldn't get data source for {layer.name}: {err}")

def process_aprx_file(aprx_path):
    """Open one project file and report on every layer of every map in it.

    Any exception raised while opening or reading the project is caught and
    printed, so one bad project does not stop the caller.

    Args:
        aprx_path: Path of the .aprx file to open.
    """
    try:
        print(f"Opening project: {aprx_path}")
        project = arcpy.mp.ArcGISProject(aprx_path)
        for current_map in project.listMaps():
            for current_layer in current_map.listLayers():
                process_layer(current_layer, current_map.name)
    except Exception as err:
        print(f"Failed to process {aprx_path}: {err}")

def main(directory):
    """Find every project file under *directory* and report on its layers.

    Prints a notice and does nothing else when no project file is found.

    Args:
        directory: Root folder to scan.
    """
    project_paths = list_aprx_files(directory)
    if project_paths:
        for project_path in project_paths:
            process_aprx_file(project_path)
    else:
        print(f"No project files (.aprx) found in {directory}.")

# Change this path to the folder you want to scan.
# The guard keeps the scan from starting when this file is imported as a module.
if __name__ == "__main__":
    main(r'O:\***\***\***')

Username1 · ‎01-29-2025

Thanks for this. Defining each step as its own function isn't something I'd considered, and it's not an approach I'd come across.

The final step of my code is to write out the findings as a spreadsheet. I've been trying to get it to work with your code but haven't managed it. I assume that it would be possible to get the results and save them out as a spreadsheet?

For reference my code looks like this:

                  feature_service.append((m.name,
                                                    lyr.name,
                                                    lyr.dataSource[211:]))
feat_service_cols = ['Map Name', 'Map Layer Name', 'SZ_gis_dataname']

# Build the report table: one row per tuple collected in feature_service
feat_service_result = pd.DataFrame(feature_service, columns=feat_service_cols)
#print (feat_service_result)

# feat_service_result is already a DataFrame, so no conversion is needed.
# The second name is kept so that code referring to it keeps working.
df_feat_service_result = feat_service_result
#print (df_feat_service_result)

# Write to Excel.
# No trailing backslash here: a raw string literal cannot end in a single
# backslash (r'C:\MyOutputFolder\' is a syntax error), and os.path.join
# adds the separator anyway.
out_folder = r'C:\MyOutputFolder'  # folder for excel
os.makedirs(out_folder, exist_ok=True)  # make output folder if missing

ts = datetime.now().strftime('%Y%m%d%H%M%S')  # timestamp; not used in this excerpt
path = os.path.join(out_folder, "Report_" + datetime.today().strftime('%Y.%m.%d') + '.xlsx')  # set path for excel output

with pd.ExcelWriter(path) as writer:
    df_feat_service_result.to_excel(writer, sheet_name='feat_service_result')
print (f"Excel outputs can be found in {path}")

Marshal · ‎01-29-2025

You can use the ArcGIS API for Python to read into Pandas dataframe, after which you could simply use .to_excel() method on the dataframe. It is unclear what type of data source your feature_service variable is, but I suspect one of the below would work.

Feature Layer to dataframe

https://developers.arcgis.com/python/latest/api-reference/arcgis.features.toc.html#arcgis.features.G...

Feature Class to dataframe

https://developers.arcgis.com/python/latest/api-reference/arcgis.features.toc.html#arcgis.features.G...

TonyAlmeida · ‎02-03-2025

Here is a function I use once a year to help me clean up data, and it allows me to create a .xlsx and share it with my peers.

import arcpy
import pandas as pd

def export_sde_feature_datasets(sde_connection, output_excel):
    """
    Lists feature datasets and feature classes in an SDE database
    and exports them to an Excel file.

    The sheet has two columns, "Feature Dataset" and "Feature Class". Each
    dataset gets a header row followed by one row per feature class, and the
    feature classes that are not inside any dataset are listed last under
    "Standalone Feature Classes".

    Args:
        sde_connection: Path of the .sde connection file to inspect.
        output_excel: Path of the .xlsx file to write.

    Note:
        arcpy.env.workspace is changed by this function and is left pointing
        at sde_connection when it returns.
    """
    # The sde_connection argument is used as passed in; it must not be
    # reassigned here, or the caller's connection would be ignored.

    # Set the workspace to the SDE connection
    arcpy.env.workspace = sde_connection

    # Empty list to store the results
    data = []

    # List of feature datasets in the SDE database
    # ("or []" covers the case where arcpy returns None instead of a list)
    feature_datasets = arcpy.ListDatasets(feature_type="feature") or []

    if not feature_datasets:
        print("No feature datasets found in the SDE database.")
    else:
        for dataset in feature_datasets:
            data.append([dataset, ""])  # Append dataset as a header

            # Get all feature classes within the dataset
            dataset_path = f"{sde_connection}\\{dataset}"
            arcpy.env.workspace = dataset_path
            feature_classes = arcpy.ListFeatureClasses() or []

            if not feature_classes:
                data.append(["", "No feature classes found"])
            else:
                for fc in feature_classes:
                    data.append(["", fc])

    # Process standalone feature classes (not inside a dataset).
    # The workspace is reset first because the loop above moved it into
    # the individual datasets.
    arcpy.env.workspace = sde_connection
    standalone_feature_classes = arcpy.ListFeatureClasses() or []

    data.append(["Standalone Feature Classes", ""])

    if not standalone_feature_classes:
        data.append(["", "No standalone feature classes found"])
    else:
        for fc in standalone_feature_classes:
            data.append(["", fc])

    # Convert list to DataFrame and export to Excel
    df = pd.DataFrame(data, columns=["Feature Dataset", "Feature Class"])
    df.to_excel(output_excel, index=False)

    print(f"Feature datasets and feature classes have been exported to {output_excel}.")

# Example usage (guarded so that importing this file does not start an export)
if __name__ == "__main__":
    sde_connection = r"C:\Users\***\AppData\Roaming\Esri\ArcGISPro\Favorites\***.sde"
    output_excel = r"C:\temp\SDEFeatureDatasetFeatureClassList.xlsx"

    export_sde_feature_datasets(sde_connection, output_excel)