From f89c0474f7373b24dc8c394c9468c25eeb3ec06f Mon Sep 17 00:00:00 2001 From: Will Badr Date: Mon, 8 Jun 2020 19:54:13 +1200 Subject: [PATCH 1/5] adding support for filters to TransformStep --- src/stepfunctions/steps/sagemaker.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py index dba0b64..35144b4 100644 --- a/src/stepfunctions/steps/sagemaker.py +++ b/src/stepfunctions/steps/sagemaker.py @@ -115,7 +115,7 @@ class TransformStep(Task): Creates a Task State to execute a `SageMaker Transform Job `_. """ - def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None, **kwargs): + def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, input_filter=None, output_filter=None, join_source=None, experiment_config=None, wait_for_completion=True, tags=None, **kwargs): """ Args: state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine. @@ -133,6 +133,9 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= content_type (str): MIME type of the input data (default: None). compression_type (str): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None. split_type (str): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'. + input_filter (str): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. 
For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None). + output_filter (str): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None). + join_source (str): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None. experiment_config (dict, optional): Specify the experiment config for the transform. (Default: None) wait_for_completion(bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True) tags (list[dict], optional): `List to tags `_ to associate with the resource. 
@@ -150,7 +153,10 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= content_type=content_type, compression_type=compression_type, split_type=split_type, - job_name=job_name + job_name=job_name, + input_filter=None, + output_filter=None, + join_source=None ) else: parameters = transform_config( @@ -159,7 +165,10 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= data_type=data_type, content_type=content_type, compression_type=compression_type, - split_type=split_type + split_type=split_type, + input_filter=None, + output_filter=None, + join_source=None ) if isinstance(job_name, (ExecutionInput, StepInput)): From 3ef4a7decedcb40fdd7b76ca2b16d36915d7b16d Mon Sep 17 00:00:00 2001 From: Will Badr Date: Mon, 8 Jun 2020 20:22:17 +1200 Subject: [PATCH 2/5] adding support for filters to TransformStep with unit tests --- tests/unit/test_sagemaker_steps.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py index c7ca95e..5275a37 100644 --- a/tests/unit/test_sagemaker_steps.py +++ b/tests/unit/test_sagemaker_steps.py @@ -390,6 +390,9 @@ def test_transform_step_creation(pca_transformer): 'TrialComponentDisplayName': 'Transform' }, tags=DEFAULT_TAGS, + join_source='Input', + output_filter='$[2:]', + input_filter='$[1:]' ) assert step.to_dict() == { 'Type': 'Task', @@ -416,6 +419,11 @@ def test_transform_step_creation(pca_transformer): 'TrialName': 'pca_trial', 'TrialComponentDisplayName': 'Transform' }, + 'DataPreProcessing': { + 'InputFilter': '$[1:]', + 'OutputFilter': '$[2:]', + 'JoinSource': 'Input', + }, 'Tags': DEFAULT_TAGS_LIST }, 'Resource': 'arn:aws:states:::sagemaker:createTransformJob.sync', From 816e50a1ea4dce41d92effca89fe63ff03851754 Mon Sep 17 00:00:00 2001 From: Will Badr Date: Mon, 8 Jun 2020 20:39:16 +1200 Subject: [PATCH 3/5] adding support for filters to TransformStep --- src/stepfunctions/steps/sagemaker.py | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py index 35144b4..06db8c7 100644 --- a/src/stepfunctions/steps/sagemaker.py +++ b/src/stepfunctions/steps/sagemaker.py @@ -154,9 +154,9 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= compression_type=compression_type, split_type=split_type, job_name=job_name, - input_filter=None, - output_filter=None, - join_source=None + input_filter=input_filter, + output_filter=output_filter, + join_source=join_source ) else: parameters = transform_config( @@ -166,9 +166,9 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= content_type=content_type, compression_type=compression_type, split_type=split_type, - input_filter=None, - output_filter=None, - join_source=None + input_filter=input_filter, + output_filter=output_filter, + join_source=join_source ) if isinstance(job_name, (ExecutionInput, StepInput)): From e330a1b99ae36ddf89dec44873703193270ffd1e Mon Sep 17 00:00:00 2001 From: Will Badr Date: Mon, 8 Jun 2020 20:57:32 +1200 Subject: [PATCH 4/5] Fixing a typo in the unit test --- tests/unit/test_sagemaker_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py index 5275a37..095f9f8 100644 --- a/tests/unit/test_sagemaker_steps.py +++ b/tests/unit/test_sagemaker_steps.py @@ -419,7 +419,7 @@ def test_transform_step_creation(pca_transformer): 'TrialName': 'pca_trial', 'TrialComponentDisplayName': 'Transform' }, - 'DataPreProcessing': { + 'DataProcessing': { 'InputFilter': '$[1:]', 'OutputFilter': '$[2:]', 'JoinSource': 'Input', From fb8efdecc4c12e969d30d0a4e5fb5b1160c3af29 Mon Sep 17 00:00:00 2001 From: Will Badr Date: Tue, 9 Jun 2020 09:37:57 +1200 Subject: [PATCH 5/5] reordering the new arguments for backward compatibility --- src/stepfunctions/steps/sagemaker.py | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py index 06db8c7..edaa64c 100644 --- a/src/stepfunctions/steps/sagemaker.py +++ b/src/stepfunctions/steps/sagemaker.py @@ -115,7 +115,7 @@ class TransformStep(Task): Creates a Task State to execute a `SageMaker Transform Job `_. """ - def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, input_filter=None, output_filter=None, join_source=None, experiment_config=None, wait_for_completion=True, tags=None, **kwargs): + def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None, input_filter=None, output_filter=None, join_source=None, **kwargs): """ Args: state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine. @@ -133,12 +133,12 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type= content_type (str): MIME type of the input data (default: None). compression_type (str): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None. split_type (str): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'. - input_filter (str): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. 
For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None). - output_filter (str): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None). - join_source (str): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None. experiment_config (dict, optional): Specify the experiment config for the transform. (Default: None) wait_for_completion(bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True) tags (list[dict], optional): `List to tags `_ to associate with the resource. + input_filter (str): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None). + output_filter (str): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None). 
+ join_source (str): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None. """ if wait_for_completion: kwargs[Field.Resource.value] = 'arn:aws:states:::sagemaker:createTransformJob.sync'