How to prevent 408 errors for PutMetricData?

0

We're using CloudWatch metrics and we are experiencing errors with PutMetricData (via the Rust aws_sdk_cloudwatch crate). I would like to understand why this error is occurring so that I can adjust how I send metrics so that it works successfully.

The debug error looks like the following, after what appears to be a 10-second timeout:

Err(ServiceError(ServiceError { source: Unhandled(Unhandled { source: XmlDecodeError { kind: Custom("no root element") }, meta: ErrorMetadata { code: None, message: None, extras: None } }), raw: Response { status: StatusCode(408), headers: Headers { headers: {"content-length": HeaderValue { _private: H0("0") }, "date": HeaderValue { _private: H0("Mon, 09 Sep 2024 20:55:45 GMT") }, "connection": HeaderValue { _private: H0("close") }} }, body: SdkBody { inner: Once(Some(b"")), retryable: true }, extensions: Extensions { extensions_02x: Extensions, extensions_1x: Extensions } } }))

Here is an executable that replicates this error when attempting to publish 125 value/count pairs, but succeeds when truncated to 100. A helper method at the bottom is producing and sending the payloads to PutMetricData, and logging out some basic stats.

use aws_sdk_cloudwatch::config::BehaviorVersion;
use aws_sdk_cloudwatch::types::Dimension;
use aws_sdk_cloudwatch::types::MetricDatum;
use aws_sdk_cloudwatch::types::StandardUnit;
use itertools::Itertools;
use tokio::time::Instant;

/// Runs three `PutMetricData` scenarios against CloudWatch and prints a report for each:
/// the full 125-pair data set, the same data cut down to its first 100 pairs, and two
/// datums that each carry those 100 pairs.
#[tokio::main]
async fn main() {
    // Number of leading value/count pairs kept for the truncated scenarios.
    const TRUNCATED_PAIRS: usize = 100;

    let sdk_config = aws_config::defaults(BehaviorVersion::latest()).load().await;
    let cw_client = aws_sdk_cloudwatch::Client::new(&sdk_config);

    // Full data set: 125 distinct values ...
    let all_values: Vec<f64> = vec![
        98541.0, 67026.0, 96172.0, 81257.0, 78637.0, 79622.0, 83649.0, 64732.0, 79871.0, 80260.0,
        80244.0, 99961.0, 102978.0, 78736.0, 79897.0, 79404.0, 100405.0, 80017.0, 79846.0, 83623.0,
        236806.0, 95570.0, 80385.0, 96966.0, 81295.0, 80134.0, 80069.0, 99614.0, 83302.0, 97889.0,
        79993.0, 80895.0, 80265.0, 79640.0, 78472.0, 80432.0, 97514.0, 80307.0, 84168.0, 80746.0,
        100452.0, 97142.0, 100383.0, 84088.0, 81623.0, 79927.0, 158122.0, 80490.0, 96406.0,
        96826.0, 96377.0, 80365.0, 81757.0, 97900.0, 78214.0, 80996.0, 78191.0, 63341.0, 80673.0,
        79601.0, 80193.0, 72833.0, 83016.0, 97199.0, 80191.0, 64229.0, 96476.0, 98516.0, 100234.0,
        96348.0, 83757.0, 83779.0, 66457.0, 101027.0, 98412.0, 205606.0, 81428.0, 99704.0, 96764.0,
        80619.0, 79670.0, 80629.0, 80585.0, 82882.0, 79790.0, 96866.0, 98877.0, 99769.0, 80609.0,
        96559.0, 79826.0, 73419.0, 80898.0, 79546.0, 96888.0, 78647.0, 78940.0, 96223.0, 80476.0,
        95708.0, 79921.0, 95619.0, 79476.0, 64980.0, 79883.0, 79628.0, 98414.0, 66497.0, 80538.0,
        98805.0, 81404.0, 65983.0, 78924.0, 83488.0, 96061.0, 77200.0, 98165.0, 81412.0, 62855.0,
        80378.0, 83776.0, 80089.0, 95655.0, 80388.0, 79382.0,
    ];
    // ... and how many times each of those values occurred (same length, index-aligned).
    let all_counts: Vec<f64> = vec![
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
    ];

    // The truncated scenarios use the leading pairs of the full data set; copy them out
    // now, because the full vectors are moved into the first scenario.
    let short_values: Vec<f64> = all_values[..TRUNCATED_PAIRS].to_vec();
    let short_counts: Vec<f64> = all_counts[..TRUNCATED_PAIRS].to_vec();

    let full_scenario = vec![(all_values, all_counts, "TestMessage")];
    test("Test 1, Original data", full_scenario, &cw_client).await;

    let truncated_scenario = vec![(short_values.clone(), short_counts.clone(), "TestMessage")];
    test(
        "Test 2, truncated to 100 pairs",
        truncated_scenario,
        &cw_client,
    )
    .await;

    let two_set_scenario = vec![
        (short_values.clone(), short_counts.clone(), "TestMessage"),
        (short_values, short_counts, "OtherMessage"),
    ];
    test(
        "Test 3, two sets, truncated to 100 pairs",
        two_set_scenario,
        &cw_client,
    )
    .await;
}

/// Sends a single `PutMetricData` request built from `metrics` and prints a short report.
///
/// Every `(values, counts, name)` tuple becomes one `ProcessingTime` datum (unit:
/// milliseconds) carrying two dimensions: `MessageName` set to `name`, and
/// `ProcessingContext` set to `"test"`. All datums go out in one request to the
/// `Playground` namespace.
///
/// The report is printed to stdout: the label, the number of datums, the total number of
/// value/count pairs, the sum of all counts, the time elapsed since the function started,
/// and the pretty-printed debug form of the request result. Nothing is returned and an
/// error result is only printed.
async fn test(
    label: &str,
    metrics: Vec<(Vec<f64>, Vec<f64>, &str)>,
    cw_client: &aws_sdk_cloudwatch::Client,
) {
    println!("{label}:");
    let start = Instant::now();

    // Summary figures are taken first, because `metrics` is consumed when the datums are built.
    let total_pairs: usize = metrics.iter().map(|entry| entry.0.len()).sum();
    let total_count: f64 = metrics
        .iter()
        .map(|entry| entry.1.iter().sum::<f64>())
        .sum();

    let metric_datums = metrics
        .into_iter()
        .map(|(datum_values, datum_counts, message_name)| {
            let message_dimension = Dimension::builder()
                .name("MessageName")
                .value(message_name)
                .build();
            let context_dimension = Dimension::builder()
                .name("ProcessingContext")
                .value("test")
                .build();

            MetricDatum::builder()
                .metric_name("ProcessingTime")
                .dimensions(message_dimension)
                .dimensions(context_dimension)
                .set_values(Some(datum_values))
                .set_counts(Some(datum_counts))
                .unit(StandardUnit::Milliseconds)
                .build()
        })
        .collect_vec();
    // Remember the length now; the vector itself is moved into the request builder.
    let metric_datum_count = metric_datums.len();

    let request = cw_client
        .put_metric_data()
        .namespace("Playground")
        .set_metric_data(Some(metric_datums));
    let res = request.send().await;

    println!("Metric datums: {metric_datum_count}");
    println!("Total value/count pairs: {total_pairs}");
    println!("Total datapoint count: {total_count}");
    println!("Elapsed time: {}ms", start.elapsed().as_millis());
    println!("Result: {res:#?}");
    println!("------");
}

I am confused why errors might be happening, since the docs say:

You can publish either individual values in the Value field, or arrays of values and the number of times each value occurred during the period by using the Values and Counts fields in the MetricData structure. Using the Values and Counts method enables you to publish up to 150 values per metric with one PutMetricData request, and supports retrieving percentile statistics on this data.

Each PutMetricData request is limited to 1 MB in size for HTTP POST requests. You can send a payload compressed by gzip. Each request is also limited to no more than 1000 different metrics.

and I don't believe this should be close to either limit.

The full program output is here:

Test 1: Original data:
Metric datums: 1
Total value/count pairs: 125
Total datapoint count: 128
Elapsed time: 10067ms
Result: Err(
    ServiceError(
        ServiceError {
            source: Unhandled(
                Unhandled {
                    source: XmlDecodeError {
                        kind: Custom(
                            "no root element",
                        ),
                    },
                    meta: ErrorMetadata {
                        code: None,
                        message: None,
                        extras: None,
                    },
                },
            ),
            raw: Response {
                status: StatusCode(
                    408,
                ),
                headers: Headers {
                    headers: {
                        "content-length": HeaderValue {
                            _private: H0(
                                "0",
                            ),
                        },
                        "date": HeaderValue {
                            _private: H0(
                                "Tue, 10 Sep 2024 03:35:31 GMT",
                            ),
                        },
                        "connection": HeaderValue {
                            _private: H0(
                                "close",
                            ),
                        },
                    },
                },
                body: SdkBody {
                    inner: Once(
                        Some(
                            b"",
                        ),
                    ),
                    retryable: true,
                },
                extensions: Extensions {
                    extensions_02x: Extensions,
                    extensions_1x: Extensions,
                },
            },
        },
    ),
)
------
Test 2: truncated to 100 pairs:
Metric datums: 1
Total value/count pairs: 100
Total datapoint count: 103
Elapsed time: 57ms
Result: Ok(
    PutMetricDataOutput {
        _request_id: Some(
            "15911e81-5917-4603-918b-a36161d14ce4",
        ),
    },
)
------
Test 3: two sets, truncated to 100 pairs:
Metric datums: 2
Total value/count pairs: 200
Total datapoint count: 206
Elapsed time: 10018ms
Result: Err(
    ServiceError(
        ServiceError {
            source: Unhandled(
                Unhandled {
                    source: XmlDecodeError {
                        kind: Custom(
                            "no root element",
                        ),
                    },
                    meta: ErrorMetadata {
                        code: None,
                        message: None,
                        extras: None,
                    },
                },
            ),
            raw: Response {
                status: StatusCode(
                    408,
                ),
                headers: Headers {
                    headers: {
                        "content-length": HeaderValue {
                            _private: H0(
                                "0",
                            ),
                        },
                        "date": HeaderValue {
                            _private: H0(
                                "Tue, 10 Sep 2024 03:35:41 GMT",
                            ),
                        },
                        "connection": HeaderValue {
                            _private: H0(
                                "close",
                            ),
                        },
                    },
                },
                body: SdkBody {
                    inner: Once(
                        Some(
                            b"",
                        ),
                    ),
                    retryable: true,
                },
                extensions: Extensions {
                    extensions_02x: Extensions,
                    extensions_1x: Extensions,
                },
            },
        },
    ),
)
------
asked 4 months ago · 114 views
1 Answer
0

Hello,

The HTTP 408 Request Timeout error occurs when the client fails to complete the request within the specified time limit set by the server.

 RequestTimeoutException
    Problems with the request at the HTTP level. Reading the Request timed out.
    HTTP Status Code: 408

The common causes of errors are:

  1. Network Issues: Slow or unstable network connections can cause delays in transmitting data, leading to timeouts. This can happen due to high network latency, network congestion, or connectivity issues.

  2. Request Size: If the PutMetricData request payload is too large, it may take longer to transmit the data, increasing the chances of a timeout. CloudWatch has limits on the maximum size of a PutMetricData request, and exceeding these limits can lead to timeouts.

  3. Client-side Issues: Issues on the client-side can also contribute to timeouts. This could include slow or inefficient code, resource constraints (e.g., CPU, memory), or other processing bottlenecks that delay the request.

  4. Server-side Issues: Although less common, issues on the server-side can also lead to timeouts. This could include temporary service outages, overloaded servers, or other issues that cause the server to take longer to process requests.

  5. Throttling: CloudWatch has limits on the number of PutMetricData requests that can be made per second. If you exceed these limits, CloudWatch may start throttling your requests, potentially leading to timeouts.

[+] PutMetricData - https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutMetricData.html

[+] How do I avoid throttling when I call PutMetricData in the CloudWatch API? https://repost.aws/knowledge-center/cloudwatch-400-error-throttling

Please refer to the following best practices, if they work for your use case.

  1. Distribute your PutMetricData calls evenly over time instead of making bursts of calls in a short period. Use jitter (randomized delay) to spread out the calls.

  2. Combine as many metrics as possible into a single PutMetricData call, up to the limits of 1,000 metrics per request and 150 values per metric (when using the Values and Counts fields). This reduces the overall number of API calls required.

  3. Implement exponential backoff and jitter when retrying failed PutMetricData calls due to throttling or other transient errors.

  4. Monitor the CloudWatch service quotas for your AWS account and Region, and request a quota increase if needed to accommodate your metric publishing requirements without hitting throttling limits.

[+] Exponential Backoff And Jitter https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/

[+] Retry behavior - https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html

[+] Retry with backoff pattern - https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/retry-backoff.html

[+] CloudWatch service quotas - https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html


For detailed investigation, we would need to check the resources and other details with respect to your account. If you still have queries regarding this, I would request you to reach out to our support team by opening a support case with all the resource details, and we will investigate the issue further.

Thank you for your interest in the re:Post community.

Best Regards, Ashish

AWS
SUPPORT ENGINEER
answered 4 months ago

You are not logged in. Log in to post an answer.

A good answer clearly answers the question and provides constructive feedback and encourages professional growth in the question asker.

Guidelines for Answering Questions