Select to view content in your preferred language

Error downloading large size hosted feature layer as fileGDB

2135
6
08-28-2020 07:30 AM
NareshAligeti1
Occasional Contributor

I am trying to download a 20GB feature layer as file GDB from AGOL. The large size is due to photo attachments. I followed the steps posted in this technical article but still cannot get the download to finish without error. I've also used JDownloader to download in segments but no luck. Can someone help me? Thanks.

0 Kudos
6 Replies
MarianneFarretta
Esri Contributor

Hi Naresh.  Can you share details about the error you're receiving?

0 Kudos
JRhodes
Frequent Contributor

Naari,

Have you tried the Python requests module to stream the download?

import requests
from pathlib import Path

# URL of the replica archive to download.
replica_url = "https://services9.arcgis.com/iERBXXD4hiy1L6en/arcgis/rest/services/Example/FeatureServer/replicaFiles/my_replica.zip"

# Destination file for the downloaded archive.
save_dir = Path("C://backup_utility//test.zip")

# Stream the download so the whole archive is never held in memory. One
# request supplies both the Content-Length header and the body, and the
# `with` block guarantees the connection is released afterwards.
with requests.get(replica_url, stream=True, timeout=30) as f:
    # Fail early instead of saving an HTTP error page as test.zip.
    f.raise_for_status()
    # The header is optional, so fall back to a placeholder for the report.
    download_size = f.headers.get('Content-length', 'unknown')
    # Count bytes as they are written; stat() on a file that is still open
    # can lag behind because of write buffering.
    size_on_disk = 0
    with open(save_dir, 'wb') as save:
        for chunk in f.iter_content(chunk_size=1024*1024):
            save.write(chunk)
            size_on_disk += len(chunk)
            print(f"{size_on_disk} of {download_size} downloaded")
‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍
0 Kudos
NareshAligeti1
Occasional Contributor

Thanks, Joe. I'll check it out.

0 Kudos
JRhodes
Frequent Contributor

Just noticed Line 8 should be:

with requests.get(replica_url, stream=True, timeout=30) as f:

0 Kudos
JRhodes
Frequent Contributor

Oops, I mean Line 8 should be:

download_size = requests.get(replica_url, stream=True).headers['Content-length']

0 Kudos
JRhodes
Frequent Contributor

Revisiting with another potential solution in case anyone stumbles on this. I've been having success creating replicas in "chunks" and exporting the service in smaller pieces, then merging them back together in Pro.

 

import json
import sys
import time
import urllib
import urllib.parse
import urllib.request

import arcgis.features
from arcgis import GIS
import wget #you could also use requests if preferred

# ---------------------------------------------------------------------------
# User settings -- edit these before running the script.
# ---------------------------------------------------------------------------
org_url = "https://your_org.maps.arcgis.com"
username = "your_username"  # admin account
password = "your_pa$$word"  # admin account
save_directory = "C://pyTest//"  # downloaded chunk archives are written here
item_ids_to_chunk = [
    "e9bec92473644fe0b95f2779b9cd5b15",
    "e9bec92473644fe0b95f2779b9cd5b15",
]
chunk_size = 1000  # ObjectID span of one chunk; adjust as appropriate for your data
# ---------------------------------------------------------------------------

# The export loop reuses `chunk_size` as a moving upper bound, so the
# configured step is remembered separately.
feature_iterator = chunk_size

# Characters replaced with "_" when an item title becomes part of a file name.
replace_list = list(' /:;\\*[]`~|,".')

# Sign in to the organization. Everything below needs the `gis` connection,
# so a failed login ends the script.
try:
    gis = GIS(org_url, username, password)
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so Ctrl-C and
    # SystemExit are not reported as a credentials problem.
    print(f"Could not authenticate for {org_url}. Check credentials.")
    print(f"Reason: {exc}")
    # Non-zero status lets a calling shell or scheduler detect the failure.
    sys.exit(1)
print(f"Authenticated for {org_url}")

def sendRequest(request):
    """Open *request* and return its body parsed as JSON.

    Args:
        request: anything ``urllib.request.urlopen`` accepts, i.e. a URL
            string or a ``urllib.request.Request``.

    Returns:
        The decoded JSON document.

    Raises:
        Whatever ``urlopen`` raises for transport problems, and
        ``json.JSONDecodeError`` when the body is not valid JSON.
    """
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        readResponse = response.read()
    return json.loads(readResponse)

#enable sync and/or extract if necessary so replica can be created
def enableSyncExtract(itemFLC, token):
    """Enable the Sync and/or Extract capability on a feature service if missing.

    Sync is considered missing when ``itemFLC.properties.syncEnabled`` is
    falsy, and Extract when the text "Extract" does not occur in
    ``itemFLC.properties.capabilities``. Whatever is missing is appended to
    the capabilities string and posted to the service's admin
    ``updateDefinition`` endpoint.

    Args:
        itemFLC: feature layer collection whose ``properties`` expose
            ``serviceItemId``, ``capabilities`` and ``syncEnabled``.
        token: token sent with the update request.

    Returns:
        The capabilities string as it was before the update, so the caller
        can restore it later; ``None`` when nothing had to be changed.

    Raises:
        RuntimeError: if the update response carries an ``error`` member.
    """
    capabilities_initial = itemFLC.properties.capabilities

    # Work out which capabilities are missing, in the order they are appended.
    additions = []
    if not itemFLC.properties.syncEnabled:
        additions.append("Sync")
    if "Extract" not in capabilities_initial:
        additions.append("Extract")
    if not additions:
        return None

    print("Enabling " + " and ".join(name.lower() for name in additions))
    capabilities = ",".join([capabilities_initial, *additions])

    # Derive the admin updateDefinition endpoint from the item's service URL.
    _item = gis.content.get(itemFLC.properties.serviceItemId)
    adm_url = _item.url.replace("/arcgis/rest/services/", "/arcgis/rest/admin/services/")
    update_url = adm_url.replace("/FeatureServer", "/FeatureServer/updateDefinition")
    rest = f"{update_url}?token={token}"

    # Serialize the nested definition as JSON. urlencode() would otherwise
    # send str(dict), which is Python repr with single quotes.
    info = {"updateDefinition": json.dumps({"capabilities": capabilities,
                                            "syncEnabled": "true"}),
            "f": "json",
            "async": "false",
            "token": token}
    data = urllib.parse.urlencode(info).encode()
    response = sendRequest(urllib.request.Request(rest, data=data))

    # A rejected update would otherwise go unnoticed here and only surface
    # later when the replica is requested.
    if isinstance(response, dict) and "error" in response:
        raise RuntimeError(f"updateDefinition failed: {response['error']}")
    return capabilities_initial

#reset sync and/or extract
def inlineSyncExtractReset(capabilities_initial, _id):
    """Write *capabilities_initial* back as the capabilities of item *_id*.

    Looks the item up through the module-level ``gis`` connection, wraps its
    URL in a FeatureLayerCollection, and updates the service definition with
    the given capabilities string.
    """
    service_url = gis.content.get(_id).url
    collection = arcgis.features.FeatureLayerCollection(service_url, gis)
    collection.manager.update_definition({"capabilities": capabilities_initial})

def generateToken():
    """Request a token from the generateToken endpoint and return it.

    Posts the module-level ``username`` and ``password`` together with a
    referer. Exits the script with status 1 when the response carries no
    token.

    Returns:
        The token string.
    """
    url = "https://arcgis.com/sharing/rest/generateToken"
    data = {'username' : username,
            'password' : password,
            'referer' : "https://www.arcgis.com",
            'f' : 'json'}
    request = urllib.request.Request(url, urllib.parse.urlencode(data).encode("utf-8"))
    jsonResponse = sendRequest(request)

    # Use .get() so a response without a 'token' member reaches the message
    # below instead of raising KeyError.
    token = jsonResponse.get('token')
    if not token:
        print("Could not obtain token. Exiting.")
        if jsonResponse.get('error'):
            print(f"Reason: {jsonResponse['error']}")
        sys.exit(1)

    print("Token successfully obtained")
    return token

token = generateToken()


def _sanitize_title(title):
    """Return *title* with every character listed in replace_list turned into '_'."""
    for bad_char in replace_list:
        title = title.replace(bad_char, '_')
    return title


def _wait_for_replica(status_url, poll_seconds=5):
    """Poll a replica job until it finishes.

    Returns the final status document when the job reports "Completed", or
    None when it reports a failure or the status request returns an error.
    """
    # NOTE(review): "CompletedWithErrors" is treated as a failure so the loop
    # cannot wait forever on it -- confirm against the createReplica job docs.
    failed_states = ("Failed", "CompletedWithErrors")
    while True:
        job = sendRequest(urllib.request.Request(status_url))
        status = job.get("status")
        if status == "Completed":
            return job
        if status in failed_states or "error" in job:
            return None
        # Pause between polls instead of hammering the service.
        time.sleep(poll_seconds)


def _export_layer(item, item_title, layer):
    """Export one layer or table of *item* as a series of ObjectID-range replicas.

    Each chunk covers ``feature_iterator`` ObjectIDs and is downloaded to
    ``save_directory``. A chunk whose replica cannot be created is reported
    and skipped.
    """
    get_feature_count_url = f"{item.url}/{layer}/query?where=1%3D1&returnIdsOnly=true&f=pjson&token={token}"
    jsonResponse = sendRequest(urllib.request.Request(get_feature_count_url))
    oid_list = jsonResponse.get('objectIds')
    if not oid_list:
        # max() of a missing or empty list would raise; nothing to export.
        print(f"Layer {layer} of {item.title} ({item.id}) has no features. Skipping.")
        return

    # Use the ObjectID field name reported by the service when present.
    oid_field = jsonResponse.get('objectIdFieldName') or 'OBJECTID'

    # Chunks are ObjectID ranges, so the highest ObjectID (not the feature
    # count) decides how many are needed. Ceiling division without floats.
    max_oid = max(oid_list)
    chunk_count = (max_oid + feature_iterator - 1) // feature_iterator
    print(f"Exporting {chunk_count} chunks for Layer {layer} of {item.title} ({item.id})\n-------------------------------------------")

    replicaURL = f"{item.url}/createReplica"
    for chunk in range(1, chunk_count + 1):
        # First chunk starts at 0; later chunks start right after the previous end.
        start_record = 0 if chunk == 1 else (chunk - 1) * feature_iterator + 1
        end_record = chunk * feature_iterator
        print(f"Exporting Chunk {chunk}: ObjectID {start_record} through {end_record}")

        layer_query = json.dumps(
            {str(layer): {"where": f"{oid_field} BETWEEN {start_record} AND {end_record}"}}
        )
        data = {'f': 'json',
                'replicaName': item.title.replace(" ", "_"),
                'layers': layer,
                'layerQueries': layer_query,
                'returnAttachments': 'true',
                'syncModel': 'none',
                'dataFormat': 'filegdb',
                'async': 'true',
                'token': token}
        payload = urllib.parse.urlencode(data).encode("utf-8")

        # Ask for the replica, retrying once when no status URL comes back.
        status_url = None
        for attempt in (1, 2):
            jsonResponse = sendRequest(urllib.request.Request(replicaURL, payload))
            status_url = (jsonResponse or {}).get('statusUrl')
            if status_url:
                break
            if attempt == 1:
                print(f"Request for ObjectIDs {start_record} to {end_record} failed. Trying again.")
        if not status_url:
            print(f"Replica creation failed for {item.title} ({item.id}).\n")
            continue  # the for loop moves on to the next chunk

        job = _wait_for_replica(f"{status_url}?f=json&token={token}")
        result_url = job.get('resultUrl') if job else None
        if not result_url:
            print(f"Replica creation failed for {item.title} ({item.id}).\n")
            continue

        save_dir = f"{save_directory}{item_title}_{item.id}_layer{layer}_chunk{chunk}.zip"
        wget.download(f"{result_url}?token={token}", save_dir) #you could use requests if preferred


# Export every configured item, layer by layer, in ObjectID chunks.
for _id in item_ids_to_chunk:
    item = gis.content.get(_id)
    print(f"\n-------------------------------------------\nStarting chunked backup for {item.title} ({item.id})")
    itemFLC = arcgis.features.FeatureLayerCollection(item.url, gis)

    try:
        capabilities_initial = enableSyncExtract(itemFLC, token)
    except Exception as exc:
        print(f"Could not verify or enable sync/extract for {item.title} ({item.id}). Exiting.")
        print(f"Reason: {exc}")
        sys.exit(1)

    # Layers plus tables, leaving out the GDB_ServiceItems table.
    item_layers = [lyr.properties.id for lyr in itemFLC.layers]
    item_layers += [tbl.properties.id for tbl in itemFLC.tables
                    if tbl.properties.name != 'GDB_ServiceItems']

    # The file-name-safe title is the same for every chunk of this item.
    item_title = _sanitize_title(item.title)

    try:
        for layer in item_layers:
            _export_layer(item, item_title, layer)
    finally:
        # Restore the original capabilities even if an export step raised.
        # capabilities_initial is falsy when nothing was changed.
        if capabilities_initial:
            try:
                inlineSyncExtractReset(capabilities_initial, _id)
                print("Capability reset successful")
            except Exception:
                print(f"***Capability reset failed for {item.title} ({item.id}). Reset manually (original capabilities: {capabilities_initial})***")

 

 

Tags (2)
0 Kudos