Extract Data from Open Data

03-07-2023 11:56 AM
kapalczynski
Frequent Contributor

I have this URL: https://geohub-vadeq.hub.arcgis.com/datasets/57759688e4944bb987add68c4f0c5ada_104/explore?location=3...

or 

https://geohub-vadeq.hub.arcgis.com/datasets/57759688e4944bb987add68c4f0c5ada_104/about

 

I want to use Python/arcpy and download all of its data. If I use the REST endpoint there is a limit of 1,000 records per request, so I can't seem to get all 50k.

Any ideas on how I can do this programmatically? Looking to get a GeoJSON or JSON and write it to a JSON file locally...


19 Replies
kapalczynski
Frequent Contributor

I can manually do it here, but I want to do this programmatically:

[screenshot: kapalczynski_0-1678220517239.png]

AaronCole1
Regular Contributor

I do this kind of thing all the time! Love data mining. You have to group your requests into batches no larger than the maximum number of records the service will return, in this case 1,000. Here's a code snippet I modified from something I use regularly. You can set it up to run as a scheduled task to be fully automated. You'll have to put in your desired output paths, and it should run.

import arcpy
import requests
import json
import os
import sys
import traceback
from datetime import date

arcpy.env.overwriteOutput = True

class DataScraper():
    def __init__(self):
        # URL to map service you want to extract data from
        self.service_url = 'https://apps.deq.virginia.gov/arcgis/rest/services/public/EDMA/MapServer/104'

    def getServiceProperties(self, url):
        URL = url
        PARAMS = {'f' : 'json'}
        r = requests.get(url = URL, params = PARAMS)
        service_props = r.json()
        return service_props


    def getLayerIds(self, url, query=None):
        URL = url + '/query'
        PARAMS = {'f':'json', 'returnIdsOnly': True, 'where' : '1=1'}
        if query:
            PARAMS['where'] = "ST = '{}'".format(query)
        r = requests.get(url = URL, params = PARAMS)
        data = r.json()
        return data['objectIds']


    def getLayerDataByIds(self, url, ids):
        # ids parameter should be a list of object ids
        URL = url + '/query'
        field = 'OBJECTID'
        value = ', '.join([str(i) for i in ids])
        PARAMS = {'f': 'json', 'where': '{} IN ({})'.format(field, value), 'returnIdsOnly': False, 'returnCountOnly': False, 'returnGeometry': True,
                  'outFields': '*'}
        r = requests.post(url=URL, data=PARAMS)
        layer_data = r.json()
        return layer_data

    # UTILITIES
    def chunks(self, lst, n):
        # Yield successive n-sized chunks from list
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

if __name__ == '__main__':
    # Can use date for naming iterative runs...
    todays_date = date.today().strftime("%Y_%m_%d")
    # Instantiate DataScraper class
    ds = DataScraper()

    # Specify where you want to save output JSON and feature class on your machine
    out_json_path = rf"e:/data{todays_date}.json"
    out_fc_path = rf"e:/data{todays_date}.shp"

    # Function call to DataScraper class to get the number of records, chunk up OIDs
    # and send requests to the service. Results are compiled into a single JSON and then
    # converted to a shapefile/feature class at the end.
    def scrapeData(out_json_path, out_fc_path):
        try:
            service_props = ds.getServiceProperties(ds.service_url)
            max_record_count = service_props['maxRecordCount']
            layer_ids = ds.getLayerIds(ds.service_url)
            id_groups = list(ds.chunks(layer_ids, max_record_count))

            for i, id_group in enumerate(id_groups):
                print('  group {} of {}'.format(i+1, len(id_groups)))
                layer_data = ds.getLayerDataByIds(ds.service_url, id_group)

                if i==0: # If first iteration of id_groups
                    layer_data_final = layer_data
                else:
                    layer_data_final['features'].extend(layer_data['features'])

            print('Writing JSON file...')
            with open(out_json_path, 'w') as out_json_file:
                json.dump(layer_data_final, out_json_file)

            print('Converting JSON to features...')
            arcpy.conversion.JSONToFeatures(out_json_path, out_fc_path)

        except Exception:
            # Handle errors accordingly...this is generic
            tb = sys.exc_info()[2]
            tb_info = traceback.format_tb(tb)[0]
            pymsg = f'PYTHON ERRORS:\n\tTraceback info:\t{tb_info}\n\tError Info:\t{str(sys.exc_info()[1])}\n'
            msgs = f'ArcPy ERRORS:\t{arcpy.GetMessages(2)}\n'
            print(pymsg)
            print(msgs)


    # Run the function to get your data (kept inside the __main__ guard so the
    # call only fires when the script is run directly, not when imported)
    scrapeData(out_json_path, out_fc_path)
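
An alternative to OBJECTID batching, where the layer reports supportsPagination in its properties, is to page the query endpoint with resultOffset/resultRecordCount. A minimal sketch (not part of the script above; the output filename is a placeholder):

import json
import requests

# Same layer as above; the page size must not exceed the service's maxRecordCount
service_url = 'https://apps.deq.virginia.gov/arcgis/rest/services/public/EDMA/MapServer/104'

def fetch_all(url, page_size=1000):
    # Page through the layer until the service stops reporting exceededTransferLimit
    features, offset = [], 0
    while True:
        params = {'f': 'json', 'where': '1=1', 'outFields': '*',
                  'returnGeometry': 'true',
                  'resultOffset': offset, 'resultRecordCount': page_size}
        data = requests.get(url + '/query', params=params).json()
        features.extend(data.get('features', []))
        if not data.get('exceededTransferLimit'):
            data['features'] = features  # last page: return everything collected
            return data
        offset += page_size

with open('all_records.json', 'w') as f:  # placeholder output path
    json.dump(fetch_all(service_url), f)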

 

kapalczynski
Frequent Contributor

@AaronCole1  Dang man.... that's awesome.... this one is going into Source Safe..... Cheers Dude

DonMorrison1
Frequent Contributor

That's a very nice program @AaronCole1 . I've been doing a lot of scraping recently and have had good luck using FeatureClassToFeatureClass. It only works on one layer at a time, you have to feed it the REST service URL, and it doesn't write to JSON like you want....

arcpy.conversion.FeatureClassToFeatureClass(in_url, os.path.dirname(out_fc), os.path.basename(out_fc))
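
For context, a minimal sketch with those variables filled in (the output path here is a placeholder, not from the thread):

import os
import arcpy

in_url = 'https://apps.deq.virginia.gov/arcgis/rest/services/public/EDMA/MapServer/104'
out_fc = r'C:\data\output.gdb\EDMA_104'  # hypothetical output feature class

# The workspace is everything up to the feature class; the name is the last component
arcpy.conversion.FeatureClassToFeatureClass(in_url, os.path.dirname(out_fc), os.path.basename(out_fc))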

 

kapalczynski
Frequent Contributor

Update: Don't mind this remark.

AaronCole1
Regular Contributor

You should just be able to replace the out_fc_path variable with the path to your data...that could be an SDE feature class. You'd point to your connection file, something like "P:/Folder/Folder/ConnectionFile.sde/GIS.DBO.FeatureClassName"
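
In the script above, that is a one-line change (a sketch; the path below is the placeholder from this post):

# Write straight to an SDE feature class instead of a shapefile (placeholder path)
out_fc_path = r"P:/Folder/Folder/ConnectionFile.sde/GIS.DBO.FeatureClassName"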

kapalczynski
Frequent Contributor

Got it working... Rocking...

Just wrote it to a FGDB for now... but SDE next per your last comment.... CHEERS 

 

import arcpy

arcpy.env.overwriteOutput = True

def copyData():
    # Set local variables
    inFeatures = 'https://apps.deq.virginia.gov/arcgis/rest/services/public/EDMA/MapServer/102'
    outLocation = r"C:\Users\xxx\Desktop\GIS_projects\TestingExportData\output.gdb"
    outFeatureClass = "TEST_PetTankFac"
    ## Add Expression
    #delimitedField = arcpy.AddFieldDelimiters(arcpy.env.workspace, "NAME")
    #expression = delimitedField + " = 'Post Office'"
         
    # Run FeatureClassToFeatureClass
    arcpy.conversion.FeatureClassToFeatureClass(inFeatures, outLocation, outFeatureClass)

if __name__ == '__main__':
    copyData()

 

kapalczynski
Frequent Contributor

My question is: the path and name are in one...but it's looking for 3 parameters.

"P:/Folder/Folder/ConnectionFile.sde/GIS.DBO.FeatureClassName"

arcpy.conversion.FeatureClassToFeatureClass(inFeatures, outLocation, outFeatureClass)

0 Kudos
by Anonymous User
Not applicable

it would be like this:

outLocation = "P:/Folder/Folder/ConnectionFile.sde"

outFeatureClass = 'FeatureClassName'
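
Putting that together with the earlier call (a sketch using the thread's placeholder paths):

import arcpy

inFeatures = 'https://apps.deq.virginia.gov/arcgis/rest/services/public/EDMA/MapServer/102'
outLocation = "P:/Folder/Folder/ConnectionFile.sde"  # workspace: just the .sde connection file
outFeatureClass = 'FeatureClassName'                 # feature class name only, no path

arcpy.conversion.FeatureClassToFeatureClass(inFeatures, outLocation, outFeatureClass)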