Data Scrape and Add Features (need attachments)

kapalczynski · ‎07-26-2024

I have some code that I am using to Scrape a Service and based on a where clause copy specific records to another dataset. It scrapes the service and creates json files for every 1000 records... It then reads the JSON files and uses the below to add them to the Service ..... This is working great.. but I need to modify it to move attachments as well... this is where I am confused ...

Because I am writing the feature to JSON and then using that to add features I am not sure how to include the attachments for each of those features in the JSON file.

Any thoughts very appreciated...

I am doing to add at the service level with 'edit_features' :

add_result = ports_layer.edit_features(adds = featureAddingAdd)

# SNIP 

portal_item = gis.content.get('73xxxxxxxxxxxxxxxxxxxxxxxxx5')
ports_layer = portal_item.tables[0]

class DataScraper():
    def __init__(self):
        # URL to map service you want to extract data from
        self.service_url = s123URL
    def getServiceProperties(self, url):
        URL = url
        PARAMS = {'f' : 'json'}
        r = requests.get(url = URL, params = PARAMS)
        service_props = r.json()
        return service_props
    def getLayerIds(self, url, query=None):
        URL = url + '/query'
        print(URL)
        PARAMS = {'f':'json', 'returnIdsOnly': True, 'where' : "Imported = 'No'"}

        if query:
            PARAMS['where'] = "ST = '{}'".format(query)
        r = requests.get(url = URL, params = PARAMS)
        data = r.json()
        
        return data['objectIds']
    def getLayerDataByIds(self, url, ids):
        # ids parameter should be a list of object ids
        URL = url + '/query'
        field = 'OBJECTID'
        value = ', '.join([str(i) for i in ids])
        PARAMS = {'f': 'json', 'where': '{} IN ({})'.format(field, value), 'returnIdsOnly': False, 'returnCountOnly': False,
                  'returnGeometry': True, 'outFields': '*'}
        r = requests.post(url=URL, data=PARAMS)
        layer_data = r.json()
        return layer_data
    def chunks(self, lst, n):
        # Yield successive n-sized chunks from list
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
            
def scrapeData():
    try:
        service_props = ds.getServiceProperties(ds.service_url)
        max_record_count = service_props['maxRecordCount']
        layer_ids = ds.getLayerIds(ds.service_url)
        
        id_groups = list(ds.chunks(layer_ids, max_record_count))
        
        for i, id_group in enumerate(id_groups):
            print('  group {} of {}'.format(i+1, len(id_groups)))
            layer_data = ds.getLayerDataByIds(ds.service_url, id_group)
            level = str(i)
            outjsonpath = outputVariable + level + ".json"

            layer_data_final = layer_data
            print('Writing JSON file...')
            with open(outjsonpath, 'w') as out_json_file:
                json.dump(layer_data_final, out_json_file)
                
    except Exception:
        # Handle errors accordingly...this is generic
        tb = sys.exc_info()[2]
        tb_info = traceback.format_tb(tb)[0]
        pymsg = 'PYTHON ERRORS:\n\tTraceback info:\t{tb_info}\n\tError Info:\t{str(sys.exc_info()[1])}\n'
        msgs = 'ArcPy ERRORS:\t{arcpy.GetMessages(2)}\n'
        print(pymsg)
        print(msgs)

def addAAHData():
    try:
        for x in os.listdir(path):
            if x.startswith("output"):
                filetoImport = path + x
                print("Appending: " + x)
                f = open(filetoImport)
                data = json.load(f)
                featureAddingAdd = data['features']

                add_result = ports_layer.edit_features(adds = featureAddingAdd)
                
    except Exception:
        # Handle errors accordingly...this is generic
        tb = sys.exc_info()[2]
        tb_info = traceback.format_tb(tb)[0]
        pymsg = 'PYTHON ERRORS:\n\tTraceback info:\t{tb_info}\n\tError Info:\t{str(sys.exc_info()[1])}\n'
        msgs = 'ArcPy ERRORS:\t{arcpy.GetMessages(2)}\n'
        print(pymsg)
        print(msgs)

kapalczynski · ‎07-26-2024

Maybe read the JSON file and create a list of GlobalIDs and then query to find those in the attribute table? and then copy them over?

As I am looping through the json file to add the feature do something similar to read the attributes and add_attachments at the service level as well?

kapalczynski · ‎07-26-2024

Totally lost with this one... something like this was my first thought... BUT I need to read the JSON file and not the Feature Directly... as I might be scraping a large file and most only rad 1000 records at a time.

Or build an array of GlobalIDs from the JSON file being processed (can be many JSON files depending on how many features are being added)

        for x in os.listdir(path):
            if x.startswith("output"):
                filetoImport = path + x
                print("Appending: " + x)
                f = open(filetoImport)
                data = json.load(f)
                featureAddingAdd = data['features']

# _feat needs to be replaced with features in JSON
# somehow have to read the JSON instead of the feature in the example
                # Create attachment
                _att = {
                    'globalId': str(uuid.uuid4()),
                    'parentGlobalId': _feat.features[0].attributes['GlobalID'],  
                    'contentType': mimetypes.guess_type(_attach)[0],
                    'name': os.path.basename(_attach),
                    'data': _data64
                }


                #add_result = ports_layer.edit_features(adds = featureAddingAdd)
                add_result = ports_layer.edit_features(adds = featureAddingAdd, use_global_ids=True, attachments={'adds':[_att]})

EarlMedina · ‎07-26-2024

In simple terms, what is it you're trying to achieve? It seems like the goal is to copy certain features (along with their attachments) from one Feature Service to another?

I'm assuming because you say you are scraping, you do not own/manage the source data and have no way to export?

kapalczynski · ‎07-26-2024

First off thanks for your reply... very appreciated.

I have access to all the data...
This has to be automated and run itself....
I have a collection that is being done via S123 that has features and Attributes. These are Hosted and public
I need to copy (daily) the new records and attributes to another feature class that is NOT hosted and secured in our SDE.
I can do this now with via 'edit_features' with Rest API but dont know how to grab the attachments as well..

WORKFLOW:

Query (scrape and write to local JSON file) the hosted FC for records that have a STATUS field value of "null"
Copy those features to the NON hosted FC in SDE (currently using Rest API to do this at the service level 'edit_features')
Update the STATUS field value in the HOSTED FC for the records that were copied to 'Imported' -- so if run again they will not create duplicates

EarlMedina · ‎07-26-2024

Okay, thanks for clarifying.

I would say continuing on your path may be more trouble than it's worth, but doable. In order to do it, you would need some unique identifier in the data (besides GlobalID as I assume that is going to change). If you have that, then you can:

Iterate through your new features with AttachmentManager.search()
Download them or maybe use them directly via file stream
Match up the local records and add attachments with arcpy.

Personally, I think the better approach is like this:

Use SyncManager.create() to create a replica of the new feature data. You can query for new data using a layer query. Besides that, you would simply set the return_attachments parameter to True and set your data_format parameter to filegdb. So, with this approach you are essentially downloading the exact data you need in its complete form. As such, you may need to think about cleanup if the replicas are large in size.
Once you've got your replica, you simply use arcpy to append to your local data.

kapalczynski · ‎07-29-2024

@EarlMedina thanks for your input ... your solution sounds much better than my approach... and much easier to deal with Attributes...
Do you have or know of any examples showing this approach.... Cheers and thanks again

EarlMedina · ‎07-29-2024

I don't know of any specific examples, but you can get a quick overview here: Sync overview | ArcGIS API for Python

I forgot to mention that for this to work, you would need to turn on the sync capability.

kapalczynski · ‎07-31-2024

@EarlMedina

Done some work on this and have a few questions you or someone else might be able to answer...

I was able to

point to a HOSTED Feature which contains a Feature Class and a TABLE
verify its SYNC enabled and see its capabilities
I then search for the specific Portal ID for this Feature Service and then create replica (filegdb) and it downloads a .zip file to my local drive...

Results/Issues:

Although it does download a .Zip file with the Feature Layer and table then Feature Layer only has a subset of records and the Table is EMPTY
If I try and put in layerQueries and ReturnsAttachements in the Parameters below I get errors... So I dont know how to download the attachements.

NOTE: This is a HOSTED Feature with FeatureClass and Table

import arcpy
import arcgis
from arcgis.gis import GIS
import arcgis.features
import time

gis = GIS("https://URL/portalx", "username", "password")
currenttoken = gis._con.token
# UAT
url = 'https://URL/hosting/rest/services/Hosted/Highway/FeatureServer/'

aah_flc = arcgis.features.FeatureLayerCollection(url, gis)
type(aah_flc)
aahlayers = aah_flc.layers

for i in aahlayers:
    print(i)

# Are they SYNC enabled
aahlayerSync = aah_flc.properties.syncEnabled
print(aahlayerSync)

# What are the SYNC capabilties
aahlayersSyncCapabilities = aah_flc.properties.syncCapabilities
print(aahlayersSyncCapabilities)

# Build list of replicas
replica_list = aah_flc.replicas.get_list()
replicalistlength = len(replica_list)

if replicalistlength > 0:
    print(replicalistlength)
else:
    print("No replicas")

aahLayersCapabilities = aah_flc.properties.capabilities
print(aahLayersCapabilities)

# Search for specific HOSTED FEATURE LAYER AND TABLE
search_result = gis.content.search("xxxxxxxxxxxxxxxxxxxacbv410", "Feature Layer")
searchResultlength = len(search_result)

if searchResultlength > 0:
    # Create Feature Layer Collection
    aah_flc = arcgis.features.FeatureLayerCollection.fromitem(search_result[0])
    type(aah_flc)
    # Export Capabilities
    exportCapabilities = aah_flc.properties.capabilities
    print(exportCapabilities)
    print("")

    # Get Extents for potential Spatial Query
    extents = aah_flc.properties['fullExtent']
    extents_str = ",".join(format(x, "10.3f") for x in [extents['xmin'],extents['ymin'],extents['xmax'],extents['ymax']])
    print(extents_str)
    print("")

    replica1 = aah_flc.replicas.create(replica_name = 'JaysTEST',
                                      layers='0,1',
                                      #layerQueries = {"1":{"queryOption": "none", "useGeometry": false, "where": "IMPORTED = No" }},
                                      #syncDirection = "download",
                                      #geometry_filter=geom_filter,
                                      #returnsAttachments=True,
                                      #returnsAttachmentsDatabyURL=True,
                                      #sync_model="perLayer",
                                      sync_model='none', # none, perReplica
                                      target_type='server',
                                      data_format='filegdb', 
                                      out_path=r'C:\Users\Desktop\PROD\exports')
    print("replica1: ")
    print(replica1)
    print("")

    
else:
    print("No results")
    print("")

RESULTS:

True

{
"supportsRegisteringExistingData": true,
"supportsSyncDirectionControl": true,
"supportsPerLayerSync": true,
"supportsPerReplicaSync": false,
"supportsRollbackOnFailure": false,
"supportsAsync": true,
"supportsSyncModelNone": true,
"supportsAttachmentsSyncDirection": true
}

8

Create,Editing,Uploads,Query,Update,Sync,Extract
Create,Editing,Uploads,Query,Update,Sync,Extract

-9283683.186,4375456.993,-8391917.256,4776746.944

replica1:
C:\Users\Desktop\PROD\exports\_ags_data7A3409E7F57446A8A52204C4A36B93BE.zip

EarlMedina · ‎07-31-2024

For creating replicas, the REST API documentation is probably a bit more helpful as it's more complete: https://developers.arcgis.com/rest/services-reference/enterprise/create-replica/

It looks like for #1 the problem is your layerQuery. I think you may need to set queryOption to "all"

As for the attachments, I believe in recent versions they make you set the attachment sync direction as well.

Try:

return_attachments=True,
attachments_sync_direction="bidirectional"