Local mode fails with custom framework estimator
Describe the bug
A clear and concise description of what the bug is.
When defining a custom estimator, remote training works but local training does not.
To reproduce
A clear, step-by-step set of instructions to reproduce the bug.
```python
framework_local = myEstimator(
    image_name=container_image_uri,
    role=role,
    entry_point='code/train.py',
    output_path= '/'.join(input_data.split('/')[:-1])+'/output',
    train_instance_count=1,
    train_instance_type='local',
    hyperparameters=hyperparameters)

framework_local.fit({'train':'file://data.parquet'}, logs=True)
# <---- fails when using files.download_and_extract(uri=uri, path=environment.code_dir)
# with botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
```
where myEstimator is from:
```python
from sagemaker.estimator import Framework

class ToyotaEstimator(Framework):
    def __init__(
        self,
        entry_point,
        source_dir=None,
        ..
        ..
        ..
```
Expected behavior
A clear and concise description of what you expected to happen.
Local mode training should succeed whenever the same estimator trains successfully in remote mode.
Screenshots or logs
If applicable, add screenshots or logs to help explain your problem.
Creating tmpptzb0bfs_algo-1-g4d94_1 ... done
Attaching to tmpptzb0bfs_algo-1-g4d94_1
algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR Reporting training FAILURE
algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR framework error:
algo-1-g4d94_1 | Traceback (most recent call last):
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/trainer.py", line 92, in train
algo-1-g4d94_1 | entry_point.run(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/entry_point.py", line 92, in run
algo-1-g4d94_1 | files.download_and_extract(uri=uri, path=environment.code_dir)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 131, in download_and_extract
algo-1-g4d94_1 | s3_download(uri, dst)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 167, in s3_download
algo-1-g4d94_1 | s3.Bucket(bucket).download_file(key, dst)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 244, in bucket_download_file
algo-1-g4d94_1 | return self.meta.client.download_file(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 170, in download_file
algo-1-g4d94_1 | return transfer.download_file(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/transfer.py", line 307, in download_file
algo-1-g4d94_1 | future.result()
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 106, in result
algo-1-g4d94_1 | return self._coordinator.result()
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 265, in result
algo-1-g4d94_1 | raise self._exception
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/tasks.py", line 255, in _main
algo-1-g4d94_1 | self._submit(transfer_future=transfer_future, **kwargs)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/download.py", line 340, in _submit
algo-1-g4d94_1 | response = client.head_object(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call
algo-1-g4d94_1 | return self._make_api_call(operation_name, kwargs)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 635, in _make_api_call
algo-1-g4d94_1 | raise error_class(parsed_response, operation_name)
algo-1-g4d94_1 | botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
algo-1-g4d94_1 |
algo-1-g4d94_1 | An error occurred (403) when calling the HeadObject operation: Forbidden
tmpptzb0bfs_algo-1-g4d94_1 exited with code 1
Aborting on container exit...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
160 try:
--> 161 _stream_output(process)
162 except RuntimeError as e:
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in _stream_output(process)
676 if exit_code != 0:
--> 677 raise RuntimeError("Process exited with code: %s" % exit_code)
678
RuntimeError: Process exited with code: 1
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-22-059e808d1544> in <module>()
10 train_config = sagemaker.session.s3_input(input_data, content_type='application/x-parquet')
11
---> 12 local_framework.fit({'train':train_config}, logs=True)
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
491 self._prepare_for_training(job_name=job_name)
492
--> 493 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
494 self.jobs.append(self.latest_training_job)
495 if wait:
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
1058 train_args["enable_sagemaker_metrics"] = estimator.enable_sagemaker_metrics
1059
-> 1060 estimator.sagemaker_session.train(**train_args)
1061
1062 return cls(estimator.sagemaker_session, estimator._current_job_name)
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics)
588 LOGGER.info("Creating training-job with name: %s", job_name)
589 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 590 self.sagemaker_client.create_training_job(**train_request)
591
592 def process(
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
101 logger.info("Starting training job")
--> 102 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
103
104 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
94
95 self.model_artifacts = self.container.train(
---> 96 input_data_config, output_data_config, hyperparameters, job_name
97 )
98 self.end_time = datetime.datetime.now()
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
164 # which contains the exit code and append the command line to it.
165 msg = "Failed to run: %s, %s" % (compose_command, str(e))
--> 166 raise RuntimeError(msg)
167 finally:
168 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpptzb0bfs/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1

It also fails when I:
- Add source.zip to S3
- Point to S3 data instead of local data
System information
A description of your system. Please provide:
- SageMaker Python SDK version: latest
- Framework name (eg. PyTorch) or algorithm (eg. KMeans): custom
- Framework version:
- Python version:
- CPU or GPU:
- Custom Docker image (Y/N): Y
Additional context
Add any other context about the problem here.