{"id":15782,"date":"2023-10-04T14:12:51","date_gmt":"2023-10-04T14:12:51","guid":{"rendered":"https:\/\/beta.bluetab.net\/lakehouse-streaming-en-aws-con-apache-flink-y-hudi-parte-2\/"},"modified":"2023-10-17T12:51:46","modified_gmt":"2023-10-17T12:51:46","slug":"lakehouse-streaming-on-aws-with-apache-flink-and-hudi-part-2","status":"publish","type":"post","link":"https:\/\/bluetab.es\/en\/lakehouse-streaming-on-aws-with-apache-flink-and-hudi-part-2\/","title":{"rendered":"LakeHouse Streaming on AWS with Apache Flink and Hudi (Part 2)"},"content":{"rendered":"\t\t<div data-elementor-type=\"wp-post\" data-elementor-id=\"15782\" class=\"elementor elementor-15782\" data-elementor-post-type=\"post\">\n\t\t\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-b2ad9fe elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"b2ad9fe\" data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-top-column elementor-element elementor-element-7eac278\" data-id=\"7eac278\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-45481b1 elementor-widget elementor-widget-heading\" data-id=\"45481b1\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h1 class=\"elementor-heading-title elementor-size-default\">LakeHouse Streaming on AWS with Apache Flink and Hudi (Part 2)<\/h1>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-7843b8f elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"7843b8f\" data-element_type=\"section\">\n\t\t\t\t\t\t<div 
class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-33 elementor-inner-column elementor-element elementor-element-1a4ee60\" data-id=\"1a4ee60\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-d5531c7 elementor-position-left elementor-vertical-align-middle elementor-widget elementor-widget-image-box\" data-id=\"d5531c7\" data-element_type=\"widget\" data-widget_type=\"image-box.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-image-box-wrapper\"><figure class=\"elementor-image-box-img\"><a href=\"https:\/\/www.linkedin.com\/in\/albertojaenrevuelta\/\" target=\"_blank\" tabindex=\"-1\"><img decoding=\"async\" width=\"150\" height=\"150\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png\" class=\"attachment-thumbnail size-thumbnail wp-image-13320 lazyload\" alt=\"\" data-srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png 150w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-300x300.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-75x75.png 75w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-270x270.png 270w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-192x192.png 192w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-180x180.png 180w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-32x32.png 32w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab.png 512w\" data-sizes=\"(max-width: 150px) 100vw, 150px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" 
style=\"--smush-placeholder-width: 150px; --smush-placeholder-aspect-ratio: 150\/150;\" \/><\/a><\/figure><div class=\"elementor-image-box-content\"><h4 class=\"elementor-image-box-title\"><a href=\"https:\/\/www.linkedin.com\/in\/albertojaenrevuelta\/\" target=\"_blank\">Alberto Jaen<\/a><\/h4><p class=\"elementor-image-box-description\">AWS Cloud Engineer <\/p><\/div><\/div>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-33 elementor-inner-column elementor-element elementor-element-a5a94df\" data-id=\"a5a94df\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-f643dea elementor-position-left elementor-vertical-align-middle elementor-widget elementor-widget-image-box\" data-id=\"f643dea\" data-element_type=\"widget\" data-widget_type=\"image-box.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-image-box-wrapper\"><figure class=\"elementor-image-box-img\"><a href=\"https:\/\/www.linkedin.com\/in\/alfonsojerezizquierdo\/\" target=\"_blank\" tabindex=\"-1\"><img decoding=\"async\" width=\"150\" height=\"150\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png\" class=\"attachment-thumbnail size-thumbnail wp-image-13320 lazyload\" alt=\"\" data-srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png 150w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-300x300.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-75x75.png 75w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-270x270.png 270w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-192x192.png 192w, 
https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-180x180.png 180w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-32x32.png 32w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab.png 512w\" data-sizes=\"(max-width: 150px) 100vw, 150px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 150px; --smush-placeholder-aspect-ratio: 150\/150;\" \/><\/a><\/figure><div class=\"elementor-image-box-content\"><h4 class=\"elementor-image-box-title\"><a href=\"https:\/\/www.linkedin.com\/in\/alfonsojerezizquierdo\/\" target=\"_blank\">Alfonso Jerez<\/a><\/h4><p class=\"elementor-image-box-description\">AWS Cloud Engineer <\/p><\/div><\/div>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-33 elementor-inner-column elementor-element elementor-element-2fa6cdb\" data-id=\"2fa6cdb\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-bae7214 elementor-position-left elementor-vertical-align-middle elementor-widget elementor-widget-image-box\" data-id=\"bae7214\" data-element_type=\"widget\" data-widget_type=\"image-box.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-image-box-wrapper\"><figure class=\"elementor-image-box-img\"><a href=\"https:\/\/www.linkedin.com\/in\/adrianjimenezhernandez\/\" target=\"_blank\" tabindex=\"-1\"><img decoding=\"async\" width=\"150\" height=\"150\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png\" class=\"attachment-thumbnail size-thumbnail wp-image-13320 lazyload\" alt=\"\" data-srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png 
150w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-300x300.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-75x75.png 75w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-270x270.png 270w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-192x192.png 192w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-180x180.png 180w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-32x32.png 32w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab.png 512w\" data-sizes=\"(max-width: 150px) 100vw, 150px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 150px; --smush-placeholder-aspect-ratio: 150\/150;\" \/><\/a><\/figure><div class=\"elementor-image-box-content\"><h4 class=\"elementor-image-box-title\"><a href=\"https:\/\/www.linkedin.com\/in\/adrianjimenezhernandez\/\" target=\"_blank\">Adri\u00e1n Jim\u00e9nez<\/a><\/h4><p class=\"elementor-image-box-description\">AWS Cloud Engineer <\/p><\/div><\/div>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-6562b00 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"6562b00\" data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-inner-column elementor-element elementor-element-7c6d220\" data-id=\"7c6d220\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-dac8e8c 
elementor-share-buttons--view-icon elementor-share-buttons--skin-minimal elementor-share-buttons--shape-circle elementor-grid-0 elementor-share-buttons--color-official elementor-widget elementor-widget-share-buttons\" data-id=\"dac8e8c\" data-element_type=\"widget\" data-widget_type=\"share-buttons.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-grid\" role=\"list\">\n\t\t\t\t\t\t\t\t<div class=\"elementor-grid-item\" role=\"listitem\">\n\t\t\t\t\t\t<div class=\"elementor-share-btn elementor-share-btn_twitter\" role=\"button\" tabindex=\"0\" aria-label=\"Share on twitter\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class=\"elementor-share-btn__icon\">\n\t\t\t\t\t\t\t\t<i class=\"fab fa-twitter\" aria-hidden=\"true\"><\/i>\t\t\t\t\t\t\t<\/span>\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-grid-item\" role=\"listitem\">\n\t\t\t\t\t\t<div class=\"elementor-share-btn elementor-share-btn_linkedin\" role=\"button\" tabindex=\"0\" aria-label=\"Share on linkedin\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class=\"elementor-share-btn__icon\">\n\t\t\t\t\t\t\t\t<i class=\"fab fa-linkedin\" aria-hidden=\"true\"><\/i>\t\t\t\t\t\t\t<\/span>\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-top-column elementor-element elementor-element-2b67acd\" data-id=\"2b67acd\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap\">\n\t\t\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-c738cdf elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"c738cdf\" 
data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-55b901d\" data-id=\"55b901d\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-7668b05 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"7668b05\" data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-592ef24\" data-id=\"592ef24\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-6010029 elementor-widget elementor-widget-theme-post-featured-image elementor-widget-image\" data-id=\"6010029\" data-element_type=\"widget\" data-widget_type=\"theme-post-featured-image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img fetchpriority=\"high\" decoding=\"async\" width=\"1024\" height=\"512\" src=\"https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-1024x512.png\" class=\"attachment-large size-large wp-image-17826\" alt=\"\" srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-1024x512.png 1024w, https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-300x150.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-768x384.png 768w, https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5.png 1200w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-f28ddaa elementor-widget 
elementor-widget-heading\" data-id=\"f28ddaa\" data-element_type=\"widget\" id=\"intro\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Introduction<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-4abbf0e elementor-widget elementor-widget-text-editor\" data-id=\"4abbf0e\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">This article is the second in a series of publications focusing on the <strong>creation of a LakeHouse with Hudi<\/strong> from a streaming ingest processed by a Flink application. The <a href=\"https:\/\/bluetab.net\/en\/lakehouse-streaming-on-aws-with-apache-flink-and-hudi-part-1\/\" target=\"_blank\" rel=\"noopener\">first article<\/a> focuses on laying a good foundation for this platform, where Flink applications were deployed with KDA (Kinesis Data Analytics) for each type of format (MoR, CoW for Hudi and JSON) that write the result of this processing into buckets.<\/span><\/p><p><span style=\"font-weight: 400;\">The input data was sent in the previous article from a local machine running a Locust application, which can present problems when scaling and processing a high volume of events. In addition, Kinesis Data Analytics applications with Flink present agility problems in their auto-scaling mode. All these new challenges will be solved in this article.<\/span><\/p><p><span style=\"font-weight: 400;\">These tables will also be cataloged in Glue, a service that provides a data catalog in AWS, in order to access them and perform queries of all kinds. 
The query engine that will consume this metadata will be Athena, which provides a scalable, agile and serverless experience to be able to execute queries with SQL or Spark for our tables hosted in S3.<\/span><\/p><p><span style=\"font-weight: 400;\">On the other hand,<strong> in this article we have also deployed the necessary components to be able to monitor our applications<\/strong> and thus draw conclusions about the speed at which data is ingested and the possible problems to be solved so that the processing has the required latency according to the requirements imposed.<\/span><\/p><p><span style=\"font-weight: 400;\">Finally, a performance and latency <strong>comparison of the different Flink<\/strong> applications that write data in Hudi and JSON formats will be made in order to see the different advantages and disadvantages of these formats.\u00a0<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7aaf8a9 elementor-widget elementor-widget-heading\" data-id=\"7aaf8a9\" data-element_type=\"widget\" id=\"objetivo\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Architecture<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-cffc795 elementor-widget elementor-widget-text-editor\" data-id=\"cffc795\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">Below you can see the high-level architecture that will be deployed:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-c33a6cf elementor-widget elementor-widget-image\" data-id=\"c33a6cf\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div 
class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1024\" height=\"296\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image17-1024x296.png\" class=\"attachment-large size-large wp-image-15640 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image17-1024x296.png 1024w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image17-300x87.png 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image17-768x222.png 768w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image17-1536x444.png 1536w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image17.png 1999w\" data-sizes=\"(max-width: 1024px) 100vw, 1024px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1024px; --smush-placeholder-aspect-ratio: 1024\/296;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ddf563e elementor-widget elementor-widget-text-editor\" data-id=\"ddf563e\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">For a better understanding we are going to explain it from left to right. As you can see, the most notable change with respect to the first article is <strong>the inclusion of a Kubernetes<\/strong> cluster to be able to scale the events that will be sent as input to our streaming application. In this way, it will be possible to thoroughly test the performance of Flink applications depending on their provisioning and especially on the type of format and table in which they write to the LakeHouse. 
In addition, an ALB (Application Load Balancer) has been made available to access the Locust interface to define the number of users to simulate and how they should scale over time. The URL to access this will appear as output when deploying the infrastructure with Terraform.<\/span><\/p><p><span style=\"font-weight: 400;\">On the other hand, significant <strong>changes have been made to the Flink KDA applications and the stream<\/strong> they read from. Each application now reads as an EFO (Enhanced Fan-Out) consumer, so that each of them has a dedicated bandwidth. The reason for this change and its details will be explained in more detail in the dedicated section for Kinesis.<\/span><\/p><p><span style=\"font-weight: 400;\">Regarding the monitoring and extraction of metrics in NRT (Near Real Time), Lambda functions have been deployed that query the tables through Athena thanks to having registered the metadata of these tables in the Glue catalog. It is important to note that the metadata of Hudi tables is registered in Glue by Flink but in the case of JSON a crawler is deployed that registers these tables in the catalog. 
This crawler must be executed manually for this table to be registered in Glue.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7f116da elementor-widget elementor-widget-heading\" data-id=\"7f116da\" data-element_type=\"widget\" id=\"coste\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Scaling<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-3fd4aa7 elementor-widget elementor-widget-text-editor\" data-id=\"3fd4aa7\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<h2><span style=\"font-weight: 400;\">Kinesis Stream<\/span><\/h2><p><span style=\"font-weight: 400;\">Since the goal is to subject the application to a considerable load of events per second, it is necessary to explain how each of the pieces of the architecture can scale according to the volume of data.<\/span><\/p><p><span style=\"font-weight: 400;\">As previously mentioned, a Kinesis Stream On-Demand has been chosen to automate the scaling of the shards during load testing. It should be noted that these streams can accommodate a write rate of up to 200% of that specified by the number of shards at any given time.<\/span><\/p><p><span style=\"font-weight: 400;\">Once the stream is above 100%, it will automatically increase the number of shards within 15 minutes. The only limitation is therefore not to exceed twice the supported write volume in less than that period.<\/span><\/p><p><span style=\"font-weight: 400;\">On the other hand, since you will have three Flink applications reading from the same stream, read limitations will be the biggest problem. A Kinesis Stream only supports 5 GetRecord calls per shard per second. 
Since each application has to read the entire stream (and therefore all shards), increasing the number of shards does not help to solve this problem.<\/span><\/p><p><span style=\"font-weight: 400;\">The solution is to register each application as an Enhanced Fan-Out consumer. This functionality of the Kinesis Stream provides each of these consumers with an individual limit of 5 GetRecord calls and 2MB per shard per second of reading.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-57b15db elementor-widget elementor-widget-image\" data-id=\"57b15db\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"851\" height=\"501\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image7.png\" class=\"attachment-large size-large wp-image-15642 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image7.png 851w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image7-300x177.png 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image7-768x452.png 768w\" data-sizes=\"(max-width: 851px) 100vw, 851px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 851px; --smush-placeholder-aspect-ratio: 851\/501;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-fb46eef elementor-widget elementor-widget-text-editor\" data-id=\"fb46eef\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">This configuration is done on the consumer side, in our case via the Kinesis connector for 
Flink:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7b80cd1 elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"7b80cd1\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>&#039;scan.stream.recordpublisher&#039; = &#039;EFO&#039;,\r\n&#039;scan.stream.efo.registration&#039; = &#039;EAGER\/LAZY&#039;,\r\n&#039;scan.stream.efo.consumername&#039; = &#039;{consumer_name}&#039; <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-6c65ea8 elementor-widget elementor-widget-text-editor\" data-id=\"6c65ea8\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">It is worth mentioning that alternatively, it is possible to increase the read latency of our Flink applications. By default Flink performs a read every 200ms per shard, so one application completely consumes the read quota of a stream. 
By increasing this value to 600ms we could accommodate all three applications, at the cost of increased latency:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7910602 elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"7910602\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>&#039;scan.shard.getrecords.intervalmillis&#039; = &#039;600&#039; <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-3611269 elementor-widget elementor-widget-text-editor\" data-id=\"3611269\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">We will also use the Adaptive Reads option, which dynamically modifies the number of events collected per call depending on the size of each record. 
This makes it possible to take advantage of the 2 MB\/s per shard available for each consumer:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-72fd9ab elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"72fd9ab\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>&#039;scan.shard.adaptivereads&#039; = &#039;true&#039; <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-6057b15 elementor-widget elementor-widget-text-editor\" data-id=\"6057b15\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">Regarding scaling in Flink KPUs (Kinesis Processing Units), we have chosen not to make use of autoscaling, since each scaling process incurs downtime for the application. Due to the different requirements of each of the applications, scaling actions at unexpected times could interrupt load testing. 
In addition, it is interesting to measure the write performance of each of the applications at equal computing capacity.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-bd8f0fe elementor-widget elementor-widget-heading\" data-id=\"bd8f0fe\" data-element_type=\"widget\" id=\"time\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Hudi<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-94c83b5 elementor-widget elementor-widget-text-editor\" data-id=\"94c83b5\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<h3><span style=\"font-weight: 400;\">Timeline<\/span><\/h3><p><span style=\"font-weight: 400;\">One of the basic systems on which Hudi&#8217;s operation and features are based is the timeline. 
Hudi keeps a chronological record of all the actions that have been performed on the table, as well as the status of each action.<\/span><\/p><p><span style=\"font-weight: 400;\">The main actions that make up the timeline are as follows:<\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Commits <\/b><span style=\"font-weight: 400;\">&#8211; atomic writing of a set of records to the table in columnar format<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Delta Commit <\/b><span style=\"font-weight: 400;\">&#8211; similar to commit, represents a write of records in the form of logs to a Merge on Read table.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Compaction <\/b><span style=\"font-weight: 400;\">&#8211; compaction of log writes (delta commits) from a MoR table to columnar format<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Cleans <\/b><span style=\"font-weight: 400;\">&#8211; deletion of old versions of files<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Rollback <\/b><span style=\"font-weight: 400;\">&#8211; deletion of records written by a failed commit or delta commit<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Savepoint <\/b><span style=\"font-weight: 400;\">&#8211; marks a set of files as &#8220;saved&#8221; so that they will not be deleted by the cleanup process. 
It allows the table to be restored to a previous point in the timeline.<\/span><\/li><\/ul><p><span style=\"font-weight: 400;\">Any of these actions can be found in one of three states:<\/span><\/p><ol><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Requested <\/b><span style=\"font-weight: 400;\">&#8211; an action has been planned but not yet started<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Inflight <\/b><span style=\"font-weight: 400;\">&#8211; the action is in progress<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Completed <\/b><span style=\"font-weight: 400;\">&#8211; denotes that the action has been completed.<\/span><\/li><\/ol><h3><span style=\"font-weight: 400;\"><br \/>Table types<\/span><\/h3><p><span style=\"font-weight: 400;\">As hinted at in the operation of the Hudi timeline, there are two types of writing supported: columnar and logs. The columnar (Parquet) format constitutes the final form of a Hudi table, together with the timeline metadata. However, it is possible to make use of log writes (Avro) to decrease the write latency and eventually compact to columnar format without hindering the write.<\/span><\/p><p><span style=\"font-weight: 400;\">The use of these writing methods gives rise to the two types of table that Hudi makes available to us:<\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Copy on Write <\/b><span style=\"font-weight: 400;\">&#8211; writes are performed exclusively in columnar format, creating a new file with the new table records. The data is available immediately but incurs higher write latency.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Merge on Read <\/b><span style=\"font-weight: 400;\">&#8211; makes use of writing to logs. The new records are initially written as logs, and will later be transformed to columnar format by the compaction process. 
We obtain lower write latency at the cost of read latency; the new logs will not be available until compaction is performed.<\/span><\/li><\/ul><h3><span style=\"font-weight: 400;\"><br \/>Query Types<\/span><\/h3><p><span style=\"font-weight: 400;\">In order to take advantage of the characteristics of each type of table, there are three types of queries that can be performed on a Hudi table<\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Snapshot <\/b><span style=\"font-weight: 400;\">&#8211; obtains the latest version of the table. For MoR tables this involves incurring a compaction process to get the latest records in log format.\u00a0<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Read Optimized <\/b><span style=\"font-weight: 400;\">&#8211; for MoR tables, reads only the records already exposed in columnar format without incurring additional read latency.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Incremental <\/b><span style=\"font-weight: 400;\">&#8211; collects only new records since a certain commit or compact, facilitating the creation of incremental pipelines. 
Not supported by Athena<\/span><\/li><\/ul>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-631aaaa elementor-widget elementor-widget-image\" data-id=\"631aaaa\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"821\" height=\"361\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image1.png\" class=\"attachment-large size-large wp-image-15647 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image1.png 821w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image1-300x132.png 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image1-768x338.png 768w\" data-sizes=\"(max-width: 821px) 100vw, 821px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 821px; --smush-placeholder-aspect-ratio: 821\/361;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ce8c917 elementor-widget elementor-widget-text-editor\" data-id=\"ce8c917\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<h2><span style=\"font-weight: 400;\">Integration with Glue Catalog<\/span><\/h2><p><span style=\"font-weight: 400;\">The Hudi connector allows a native integration with the Glue catalog in AWS. 
Simply add the Hive dependencies in our Flink application:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-d7bfae8 elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"d7bfae8\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>com.amazonaws.aws-java-sdk-glue\r\norg.apache.hive.hive-common\r\norg.apache.hive.hive-exec <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-0aebff7 elementor-widget elementor-widget-text-editor\" data-id=\"0aebff7\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">And specify the catalog configuration in the Hudi connector<\/span><span style=\"font-weight: 400;\">:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-110116e elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"110116e\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>&#039;hive_sync.enable&#039; = &#039;true&#039;,\r\n&#039;hive_sync.db&#039; = &#039;{glue_database}&#039;,\r\n&#039;hive_sync.table&#039; = 
&#039;{table_name}&#039;,\r\n&#039;hive_sync.partition_fields&#039; = &#039;{partition_fields}&#039;,\r\n&#039;hive_sync.mode&#039; = &#039;glue&#039;,\r\n&#039;hive_sync.use_jdbc&#039; = &#039;false&#039; <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-61c32f2 elementor-widget elementor-widget-text-editor\" data-id=\"61c32f2\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">With this integration, the application will automatically create the tables in the catalog. As mentioned before, there are different types of queries to query a Hudi table. Therefore, different tables will be created in the catalog to support the different queries.<\/span><\/p><p><span style=\"font-weight: 400;\">For a CoW table, the table will be queried using a Snapshot query. 
For MoR on the other hand, two tables will be made available to support Read Optimized or Snapshot queries.<\/span><\/p><p><span style=\"font-weight: 400;\">The main application of Glue is to support lambdas so that when executing queries through Athena their execution can be done in a more efficient, fast and secure way:<\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><span style=\"font-weight: 400;\"><strong>Glue Catalog<\/strong>: centralized storage of information about the organization, design and format of the data, used by Athena to directly perform queries to S3 without having to rely on third parties to obtain this information.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><span style=\"font-weight: 400;\"><strong>Schema Automation<\/strong>: Glue automatically tracks and catalogs data in S3, detecting and adapting schema changes. This avoids possible errors and allows the reading of new fields in case of alterations in the event schemas.<\/span><\/li><\/ul>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ec17b62 elementor-widget elementor-widget-heading\" data-id=\"ec17b62\" data-element_type=\"widget\" id=\"clonacion\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Hudi configuration\n<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-3f77738 elementor-widget elementor-widget-text-editor\" data-id=\"3f77738\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">It is important to understand the configurations offered by Hudi to optimize our application, in particular for a Near Real Time application it is convenient to be aware of the available options. 
Although the configuration capacity is immense <a href=\"#referencias\">[<\/a><\/span><span style=\"font-weight: 400;\">1<\/span><span style=\"font-weight: 400;\">], <\/span><span style=\"font-weight: 400;\">we will try to summarize the most relevant ones for a first contact with this technology.<\/span><\/p><h3><span style=\"font-weight: 400;\">Partitioning<\/span><\/h3><p><span style=\"font-weight: 400;\">Apache Hudi offers the types of partitioning that can be found in other solutions, the main ones will be detailed and the implemented one will be justified:<\/span><span style=\"font-weight: 400;\"><br \/><\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Simple<\/b><span style=\"font-weight: 400;\">: partitioning based on a single field, in this case the field chosen is &#8216;ticker&#8217; as it has been identified as the one with the lowest cardinality.<\/span><\/li><\/ul><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Compound Partitioning<\/b><span style=\"font-weight: 400;\">: partitioning based on multiple fields, it could be interesting to choose a low cardinality field (ticker) and a medium cardinality field (date).<\/span><\/li><\/ul><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Dynamic Partitioning<\/b><span style=\"font-weight: 400;\">: choice of the variable based on the values, it can be interesting when the cardinality of the variables can undergo variations and an update of the partitioning is required in an automatic and flexible way.<\/span><\/li><\/ul><h3><span style=\"font-weight: 400;\"><br \/>Indexes<\/span><\/h3><p><span style=\"font-weight: 400;\">Apache Hudi has multiple types of indexing <\/span><span style=\"font-weight: 400;\"><a href=\"#referencias\">[<\/a><\/span><span style=\"font-weight: 400;\">2<\/span><span style=\"font-weight: 400;\">], we will briefly discuss the most common ones:<\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Bloom Index <\/b><span style=\"font-weight: 
400;\">&#8211; Makes use of a bloom filter on the key of the events, additionally it can be complemented with a filtering by key range. It works well when dealing with a table where most changes occur in the most recent partitions or for event deduplication.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Simple<\/b><span style=\"font-weight: 400;\">: indexing performed by the combination of FileID and RecordKey. Recommended when Upsert operations are not so frequent due to the simplicity it offers.<\/span><\/li><\/ul><p><span style=\"font-weight: 400;\">Both types of indexes can be used in their global form<\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Global index <\/b><span style=\"font-weight: 400;\">&#8211; They impose the uniqueness of the keys in all the partitions of the table, that is to say, they guarantee that there will be only one record with a certain key.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Non-global index <\/b><span style=\"font-weight: 400;\">&#8211; Key uniqueness is only required at the partition level. 
If the data is consistent and a key is only going to exist in one partition, this type of index offers much better performance and better scaling.<\/span><\/li><\/ul><p><span style=\"font-weight: 400;\">In this case, a Bloom Index has been chosen, which is the default in case it is not expressly stated:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-86d3821 elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"86d3821\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>&quot;hoodie.index.type&quot; = &quot;BLOOM&quot; <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7629201 elementor-widget elementor-widget-text-editor\" data-id=\"7629201\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">The choice of this type of indexing is due to the fact that the use cases that have been raised require a considerably high and efficient data processing.<\/span><\/p><h3><span style=\"font-weight: 400;\">Types of operations<\/span><\/h3><p><span style=\"font-weight: 400;\">Apache Hudi offers several types of operations <\/span><span style=\"font-weight: 400;\"><a href=\"#referencias\">[<\/a><\/span><span style=\"font-weight: 400;\">3<\/span><span 
style=\"font-weight: 400;\">] that allow users to manage and modify large data sets. The main operations performed in Stress Tests as well as in other scenarios are detailed below: <\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Upsert &#8211; <\/b><span style=\"font-weight: 400;\">This is the default operation, and will execute an insert or an update depending on whether the record already exists after an index lookup. With this operation the table will have no duplicates for its primary key.<\/span><\/li><li aria-level=\"1\"><b>Insert &#8211; <\/b><span style=\"font-weight: 400;\">This operation ignores the index lookup when inserting events. It is the fastest but the table may contain duplicates. It is still useful if auxiliary deduplication methods are used, or simply the existence of these is tolerable in the use case.<\/span><\/li><li aria-level=\"1\"><b>Delete<\/b><span style=\"font-weight: 400;\">: Hudi offers two deletion methods. Soft Delete converts to null the values of the event except for the key. Hard Delete executes a physical deletion of the event in the table.<\/span><\/li><li aria-level=\"1\"><b>Bulk Insert &#8211; <\/b><span style=\"font-weight: 400;\">Operation similar to Insert but optimized for insertion of a large volume of data, at the cost of sacrificing some guarantees in file size control. Scales well for hundreds of TBs in case of initial bootstrap of a large table.<\/span><\/li><\/ul><h3><span style=\"font-weight: 400;\">Compaction<\/span><\/h3><p><span style=\"font-weight: 400;\">In the case of using a MoR table, it is possible to configure the log compaction rate to find the balance between write and read latency that best suits the use case. 
It is possible to specify a strategy of time or number of delta commits (or both) that execute a compaction process:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-544fc8c elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"544fc8c\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>compaction.delta_commits\r\ncompaction.delta_seconds\r\ncompaction.trigger.strategy <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-624c648 elementor-widget elementor-widget-text-editor\" data-id=\"624c648\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<h3><span style=\"font-weight: 400;\">Asynchronous actions<\/span><\/h3><p><span style=\"font-weight: 400;\">Certain timeline actions such as compacting, cleaning, archiving and clustering can be performed asynchronously by the application, or even relegated to auxiliary processes to the writing application. 
In the case of Flink, it can help improve write latency and avoid BackPressure problems in the application:<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-48408c0 elementor-widget elementor-widget-elementor-syntax-highlighter\" data-id=\"48408c0\" data-element_type=\"widget\" data-widget_type=\"elementor-syntax-highlighter.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<pre><code class='language-python'>compaction.async.enabled\r\nhoodie.clean.async\r\nhoodie.archive.async\r\nhoodie.clustering.async.enabled <\/code><\/pre><script>\nif (!document.getElementById('syntaxed-prism')) {\n\tvar my_awesome_script = document.createElement('script');\n\tmy_awesome_script.setAttribute('src','https:\/\/bluetab.es\/wp-content\/plugins\/syntax-highlighter-for-elementor\/assets\/prism2.js');\n\tmy_awesome_script.setAttribute('id','syntaxed-prism');\n\tdocument.body.appendChild(my_awesome_script);\n} else {\n\twindow.Prism && Prism.highlightAll();\n}\n<\/script>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7b2b8ef elementor-widget elementor-widget-heading\" data-id=\"7b2b8ef\" data-element_type=\"widget\" id=\"conclusiones\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Stress Tests &amp; Insights\n<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-885b490 elementor-widget elementor-widget-text-editor\" data-id=\"885b490\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">When deploying the applications, different tests have been performed, varying both the maximum load of events and the concurrency and exponential degree of growth of the same. 
This has been possible thanks to the flexibility offered by Locust being built on a Kubernetes cluster, being able to set a maximum limit of concurrency of events and an incremental of them. In the tests, a maximum limit of 5 to 15K simultaneous users (Peak Concurrency) has been established, scaling the frequency of the same in a linear way, from 5 to 20 more users per second (Spawn Rate):<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-b85548e elementor-widget elementor-widget-image\" data-id=\"b85548e\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1024\" height=\"383\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image12-1024x383.png\" class=\"attachment-large size-large wp-image-15658 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image12-1024x383.png 1024w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image12-300x112.png 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image12-768x288.png 768w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/image12.png 1167w\" data-sizes=\"(max-width: 1024px) 100vw, 1024px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1024px; --smush-placeholder-aspect-ratio: 1024\/383;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-10777c1 elementor-widget elementor-widget-text-editor\" data-id=\"10777c1\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">The different tests have been monitored in order to draw conclusions about 
the performance, taking into account the specific characteristics of each of the formats. The metrics on which the analyses have been based are both the native CloudWatch Metrics (CPU &amp; Memory Utilization, KPUs, LastCheckpoint Size &amp; Duration,&#8230;), as well as the metrics obtained from the Lambdas that periodically consult the number of events available in the buckets and calculate the average latency of the same.<\/span><\/p><h3><span style=\"font-weight: 400;\"><br \/>Number of Events<\/span><\/h3>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-1691708 elementor-widget elementor-widget-image\" data-id=\"1691708\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1000\" height=\"135\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/low5.jpg\" class=\"attachment-large size-large wp-image-15769 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/low5.jpg 1000w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/low5-300x41.jpg 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/low5-768x104.jpg 768w\" data-sizes=\"(max-width: 1000px) 100vw, 1000px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1000px; --smush-placeholder-aspect-ratio: 1000\/135;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ddb9efb elementor-widget__width-initial elementor-widget elementor-widget-text-editor\" data-id=\"ddb9efb\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">When analyzing the total number of events 
processed, which are sent gradually, i.e., as time passes more and more events are sent per second, a fairly similar trend is identified although JSON and Hudi MoR stand out over Hudi CoW in terms of performance. It is worth noting that JSON shows a more stable and steady growth compared to Hudi MoR and CoW and this is because the latter are able to handle incremental updates in the data.<\/span><\/p><p><span style=\"font-weight: 400;\">The similarity between JSON and Hudi MoR makes the choice entirely based on the characteristics of the project. In case the data is not updated JSON may be a more interesting solution mainly due to its simplicity, while if there is a high frequency of historical data update, Hudi MoR may be a better solution. This is due both to the higher efficiency in reading tasks and because of the possibility to record different versions of the data.<\/span><\/p><h3><span style=\"font-weight: 400;\">Latency<\/span><\/h3><p><span style=\"font-weight: 400;\">Due to the difficulty of standardizing the latency calculation logic between 3 different types of storage, we have chosen to simplify it by calculating it as the difference between the time of event creation and the time of processing in the respective application.<\/span><\/p><h3><span style=\"font-weight: 400;\">\u00a0<\/span><\/h3>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-9e8274c elementor-widget elementor-widget-image\" data-id=\"9e8274c\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1000\" height=\"135\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new4.jpg\" class=\"attachment-large size-large wp-image-15785 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new4.jpg 1000w, 
https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new4-300x41.jpg 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new4-768x104.jpg 768w\" data-sizes=\"(max-width: 1000px) 100vw, 1000px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1000px; --smush-placeholder-aspect-ratio: 1000\/135;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-481b842 elementor-widget elementor-widget-text-editor\" data-id=\"481b842\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">Similar behavior is observed between JSON and Hudi MoR, although the former in a more critical way, having a very low initial latency but as both processing time and load volume increases, this latency is negatively affected.<\/span><\/p><p><span style=\"font-weight: 400;\">The choice between JSON and Hudi MoR will depend both on the fault tolerance of the application and the characteristics of each of the formats, in case the data structure is stable and does not change frequently, or does not depend on incremental updates and can deal with complete rewrites, then JSON may be a better choice.<\/span><\/p><p><span style=\"font-weight: 400;\">The choice of Hudi CoW over MoR can be made when high error tolerance and high recoverability from failed or corrupted write events are required.<\/span><\/p><h3><span style=\"font-weight: 400;\"><br \/>CPU utilization<\/span><\/h3>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-363e3ae elementor-widget elementor-widget-image\" data-id=\"363e3ae\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div 
class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1000\" height=\"135\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new3.jpg\" class=\"attachment-large size-large wp-image-15787 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new3.jpg 1000w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new3-300x41.jpg 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new3-768x104.jpg 768w\" data-sizes=\"(max-width: 1000px) 100vw, 1000px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1000px; --smush-placeholder-aspect-ratio: 1000\/135;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-b0d410b elementor-widget elementor-widget-text-editor\" data-id=\"b0d410b\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">When analyzing CPU usage, a certain homogeneity has been identified among the different tests, even when working with different workloads. JSON and Hudi MoR stand out for having the lowest CPU usage levels, both for different reasons. JSON stands out for its simplicity by directly including the new data without having to deal with data versioning, while MoR does not consume as much CPU since, due to its characteristics, the highest CPU consumption is made when performing read queries, in the write tasks it only identifies the changes that will be applied when querying them.<\/span><\/p><p><span style=\"font-weight: 400;\">Remember that CloudWatch native metrics only allow us to monitor the applications, which correspond to the writing tasks. 
The monitoring of read tasks corresponds to the Lambdas mentioned above.\u00a0<\/span><\/p><p><span style=\"font-weight: 400;\">In this case MoR is more beneficial with respect to CoW, since the higher CPU consumption in MoR occurs when querying the stored data while in CoW it occurs when updating the data.<\/span><\/p><p><span style=\"font-weight: 400;\">The choice between the most efficient formats depends on the needs of the project, in case a higher fault tolerance, data versioning and higher reading efficiency are required, MoR will be chosen over JSON, between the two Hudi formats, again, the choice will depend on the characteristics of the project, if the queries require heavy and\/or complex transformations, MoR would be chosen; if, on the other hand, the project requires greater data integrity and\/or the data ingestion is in batch, CoW would be more interesting because when working with these volumes of data, having backup copies, in case of errors, the impact in terms of costs and recovery time is lower.<\/span><\/p><h3><span style=\"font-weight: 400;\"><br \/>Memory Utilization<\/span><\/h3>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-c72e45f elementor-widget elementor-widget-image\" data-id=\"c72e45f\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1000\" height=\"135\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new2.jpg\" class=\"attachment-large size-large wp-image-15789 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new2.jpg 1000w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new2-300x41.jpg 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new2-768x104.jpg 768w\" data-sizes=\"(max-width: 1000px) 100vw, 1000px\" 
src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1000px; --smush-placeholder-aspect-ratio: 1000\/135;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-42a72e3 elementor-widget elementor-widget-text-editor\" data-id=\"42a72e3\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">JSON again stands out for having the lowest memory usage values, although for the number of transformations that are performed, they are relatively high, especially considering that it does not have to deal with version management or data merging. These values are due to the fact that it does not have optimized compression capabilities or efficient schema management.<\/span><\/p><p><span style=\"font-weight: 400;\">Regarding Hudi, similar conclusions can be drawn as in the CPU usage section, MoR has a higher memory utilization than JSON due to delta log processing and version management and a lower one than CoW since the actual data consolidation does not occur during writing.<\/span><\/p><p>\u00a0<\/p><h3><span style=\"font-weight: 400;\">Last Checkpoint Size<\/span><\/h3>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-5db63db elementor-widget elementor-widget-image\" data-id=\"5db63db\" data-element_type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"1000\" height=\"135\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new1.jpg\" class=\"attachment-large size-large wp-image-15791 lazyload\" alt=\"\" data-srcset=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new1.jpg 1000w, 
https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new1-300x41.jpg 300w, https:\/\/www.bluetab.net\/wp-content\/uploads\/2023\/10\/new1-768x104.jpg 768w\" data-sizes=\"(max-width: 1000px) 100vw, 1000px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 1000px; --smush-placeholder-aspect-ratio: 1000\/135;\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-649700c elementor-widget elementor-widget-text-editor\" data-id=\"649700c\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">It is important to highlight, once again, the stability of JSON compared to Hudi applications, since it not only shows a lower value than both in the tests performed, but also a stability that is not achieved with either MoR or CoW, since, as can be seen, when monitoring the size of the Checkpoints, considerable volatility is perceived.<\/span><\/p><p><span style=\"font-weight: 400;\">Perceived volatility in Hudi applications is mainly due to Checkpoint failures, which leads to a larger Checkpoint volume after the failure. 
In addition to this, the volatility in Checkpoint sizes may be related to the optimization and compaction operations performed internally that may lead to state compaction, which considerably reduces the size of the Checkpoint.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-45e03ee elementor-widget elementor-widget-heading\" data-id=\"45e03ee\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Development challenges\n<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-e06b516 elementor-widget elementor-widget-text-editor\" data-id=\"e06b516\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<h3><span style=\"font-weight: 400;\">Read Throughput of Kinesis and EFO<\/span><\/h3><p><span style=\"font-weight: 400;\">In order not to exceed the read limit on the Kinesis Stream we have chosen to subscribe the consumers as Enhanced Fan-Out. In some tests in conjunction with Autoscaling this has given problems with the Flink Kinesis connector being unable to close connections when scaling the cluster.<\/span><\/p><h3><span style=\"font-weight: 400;\">Hudi configuration<\/span><\/h3><p><span style=\"font-weight: 400;\">Hudi&#8217;s configuration has been another sticking point during development. Under high loads the compaction and cleanup processes are more likely to cause backpressure problems and cause application errors. Although configuring these processes to occur asynchronously can alleviate this problem, conflicts and misalignment between processes can arise under high loads. 
A balance between these configurations and the application&#8217;s cluster capacity is key to the smooth operation of the application.<\/span><\/p><h3><span style=\"font-weight: 400;\">Format heterogeneity<\/span><\/h3><p><span style=\"font-weight: 400;\">When analyzing the performance of the 3 applications, there is an additional difficulty due to the nature of the format types, which has an impact both on the architecture and on the development of the logics.<\/span><span style=\"font-weight: 400;\"><br \/><\/span><span style=\"font-weight: 400;\"><br \/><\/span><span style=\"font-weight: 400;\">The different behavior of the formats in the ingest complicates the development of logics when calculating latency. MoR writes to logs after compaction, so the data is not immediately available as is the case with CoW or JSON.\u00a0 This implies that the common measurable metric for all formats is read availability, which is not the main purpose of a MoR table.\u00a0\u00a0<\/span><\/p><h3><span style=\"font-weight: 400;\">Synchronization with the Glue Catalog<\/span><\/h3><p><span style=\"font-weight: 400;\">One of the great advantages we have found with Hudi is its ability to synchronize with the Glue catalog, creating the tables and keeping them updated without the need for a crawler. 
This allows for a cleaner application and architecture than in the case of JSON, for which it must be run manually when deploying applications.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7739c9b elementor-widget elementor-widget-heading\" data-id=\"7739c9b\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Conclusions<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-c8a87a1 elementor-widget elementor-widget-text-editor\" data-id=\"c8a87a1\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">The test results show considerable <\/span><b>differences between the JSON, Hudi MoR and CoW formats in terms of efficiency<\/b><span style=\"font-weight: 400;\">, responsiveness and resource utilization. We proceed to analyze each of the aspects in more detail:<\/span><span style=\"font-weight: 400;\"><br \/><\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Processing Efficiency<\/b><span style=\"font-weight: 400;\">: JSON and Hudi MoR stand out in most metrics, showing optimal performance in terms of Latency, CPU &amp; Memory Utilization. However, JSON behavior is more stable and predictable, although MoR has advantages over JSON, for example, in incremental update management.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Resilience and Fault Tolerance<\/b><span style=\"font-weight: 400;\">: fault tolerance is a very important factor in the decision on the choice between Hudi and JSON. 
In the case of MoR and CoW, it will depend on the degree of criticality, since at a general level the performance in writing tasks for MoR is superior.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Resource Usage<\/b><span style=\"font-weight: 400;\">: JSON is shown to be the most lightweight, with low CPU and memory utilization, due to its inherent simplicity. Whereas Hudi MoR and CoW, due to the nature of their design and data management, require more resources, especially in operations involving version management and data compaction.<\/span><\/li><\/ul><p><span style=\"font-weight: 400;\">Finally, it is interesting to identify in which <\/span><b>use cases or projects each of the formats may be more recommendable<\/b><span style=\"font-weight: 400;\"> depending on their characteristics and the red flags that may be established:<\/span><span style=\"font-weight: 400;\"><br \/><\/span><\/p><ul><li style=\"font-weight: 400;\" aria-level=\"1\"><b>JSON<\/b><span style=\"font-weight: 400;\">: Recommended for applications with stable data structures that do not require incremental updates and where simplicity and stability are key.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Hudi MoR<\/b><span style=\"font-weight: 400;\">: Suitable for projects that require efficient management of incremental updates and where latency and writing efficiency are crucial.<\/span><\/li><li style=\"font-weight: 400;\" aria-level=\"1\"><b>Hudi CoW<\/b><span style=\"font-weight: 400;\">: Ideal for contexts where data integrity is essential, and robust error recovery is needed, especially in batch ingest scenarios.\u00a0<\/span><\/li><\/ul>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-30687e6 elementor-widget elementor-widget-spacer\" data-id=\"30687e6\" data-element_type=\"widget\" id=\"ref\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div 
class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-239ebd9 elementor-widget elementor-widget-heading\" data-id=\"239ebd9\" data-element_type=\"widget\" id=\"referencias\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">References<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-e34c034 elementor-widget elementor-widget-text-editor\" data-id=\"e34c034\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">[1] Hudi Tables Configuration. [<\/span><a href=\"https:\/\/hudi.apache.org\/docs\/next\/configurations\/\"><span style=\"font-weight: 400;\">link<\/span><\/a><span style=\"font-weight: 400;\">]<\/span><\/p><p><span style=\"font-weight: 400;\">[2] Index Types in Hudi. [<\/span><a href=\"https:\/\/hudi.apache.org\/docs\/next\/indexing\/#index-types-in-hudi\"><span style=\"font-weight: 400;\">link<\/span><\/a><span style=\"font-weight: 400;\">]<\/span><\/p><p><span style=\"font-weight: 400;\">[3] Hudi Operation Types. 
[<\/span><a href=\"https:\/\/hudi.apache.org\/docs\/write_operations\/\"><span style=\"font-weight: 400;\">link<\/span><\/a><span style=\"font-weight: 400;\">]<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ec110af elementor-widget elementor-widget-heading\" data-id=\"ec110af\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Authors<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ede07de elementor-position-left elementor-vertical-align-middle elementor-widget elementor-widget-image-box\" data-id=\"ede07de\" data-element_type=\"widget\" data-widget_type=\"image-box.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-image-box-wrapper\"><figure class=\"elementor-image-box-img\"><a href=\"https:\/\/www.linkedin.com\/in\/albertojaenrevuelta\/\" target=\"_blank\" tabindex=\"-1\"><img decoding=\"async\" width=\"150\" height=\"150\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png\" class=\"attachment-thumbnail size-thumbnail wp-image-13320 lazyload\" alt=\"\" data-srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png 150w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-300x300.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-75x75.png 75w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-270x270.png 270w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-192x192.png 192w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-180x180.png 180w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-32x32.png 32w, 
https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab.png 512w\" data-sizes=\"(max-width: 150px) 100vw, 150px\" src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 150px; --smush-placeholder-aspect-ratio: 150\/150;\" \/><\/a><\/figure><div class=\"elementor-image-box-content\"><h4 class=\"elementor-image-box-title\"><a href=\"https:\/\/www.linkedin.com\/in\/albertojaenrevuelta\/\" target=\"_blank\">Alberto Jaen<\/a><\/h4><p class=\"elementor-image-box-description\">AWS Cloud Engineer <\/p><\/div><\/div>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ac9bbd1 elementor-widget elementor-widget-text-editor\" data-id=\"ac9bbd1\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">I started my career with the development, maintenance and administration of multidimensional databases and Data Lakes. 
From there I started to be interested in data platforms and cloud architectures, being certified 3 times in AWS and 2 with Hashicorp.<\/span><\/p><p><span style=\"font-weight: 400;\">I am currently working as a Cloud Engineer developing Data Lakes and DataWarehouses with AWS for a client related to the organization of sporting events worldwide.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-8441f44 elementor-position-left elementor-vertical-align-middle elementor-widget elementor-widget-image-box\" data-id=\"8441f44\" data-element_type=\"widget\" data-widget_type=\"image-box.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-image-box-wrapper\"><figure class=\"elementor-image-box-img\"><a href=\"https:\/\/www.linkedin.com\/in\/alfonsojerezizquierdo\/\" target=\"_blank\" tabindex=\"-1\"><img decoding=\"async\" width=\"150\" height=\"150\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png\" class=\"attachment-thumbnail size-thumbnail wp-image-13320 lazyload\" alt=\"\" data-srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png 150w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-300x300.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-75x75.png 75w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-270x270.png 270w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-192x192.png 192w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-180x180.png 180w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-32x32.png 32w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab.png 512w\" data-sizes=\"(max-width: 150px) 100vw, 150px\" 
src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 150px; --smush-placeholder-aspect-ratio: 150\/150;\" \/><\/a><\/figure><div class=\"elementor-image-box-content\"><h4 class=\"elementor-image-box-title\"><a href=\"https:\/\/www.linkedin.com\/in\/alfonsojerezizquierdo\/\" target=\"_blank\">Alfonso Jerez<\/a><\/h4><p class=\"elementor-image-box-description\">AWS Cloud Engineer <\/p><\/div><\/div>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-863802c elementor-widget elementor-widget-text-editor\" data-id=\"863802c\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">Passionate about data and new technologies, specialized as AWS Cloud Engineer in DataWarehouses optimization and Data Lakes ingestion and transformation processes. 
Motivated by continuous improvement and automation of service integration.<\/span><\/p><p><span style=\"font-weight: 400;\">Actively collaborating with the Cloud Practice group in research and blog development of cutting-edge and innovative technologies such as this one, thus fostering continuous learning.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-c2a26f7 elementor-position-left elementor-vertical-align-middle elementor-widget elementor-widget-image-box\" data-id=\"c2a26f7\" data-element_type=\"widget\" data-widget_type=\"image-box.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-image-box-wrapper\"><figure class=\"elementor-image-box-img\"><a href=\"https:\/\/www.linkedin.com\/in\/adrianjimenezhernandez\/\" target=\"_blank\" tabindex=\"-1\"><img decoding=\"async\" width=\"150\" height=\"150\" data-src=\"https:\/\/www.bluetab.net\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png\" class=\"attachment-thumbnail size-thumbnail wp-image-13320 lazyload\" alt=\"\" data-srcset=\"https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-150x150.png 150w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-300x300.png 300w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-75x75.png 75w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-270x270.png 270w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-192x192.png 192w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-180x180.png 180w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab-32x32.png 32w, https:\/\/bluetab.es\/wp-content\/uploads\/2022\/03\/cropped-Isotipo-Bluetab.png 512w\" data-sizes=\"(max-width: 150px) 100vw, 150px\" 
src=\"data:image\/svg+xml;base64,PHN2ZyB3aWR0aD0iMSIgaGVpZ2h0PSIxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\" style=\"--smush-placeholder-width: 150px; --smush-placeholder-aspect-ratio: 150\/150;\" \/><\/a><\/figure><div class=\"elementor-image-box-content\"><h4 class=\"elementor-image-box-title\"><a href=\"https:\/\/www.linkedin.com\/in\/adrianjimenezhernandez\/\" target=\"_blank\">Adri\u00e1n Jim\u00e9nez<\/a><\/h4><p class=\"elementor-image-box-description\">AWS Cloud Engineer <\/p><\/div><\/div>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-4e0baea elementor-widget elementor-widget-text-editor\" data-id=\"4e0baea\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><span style=\"font-weight: 400;\">Dedicated to constantly learning new technologies and their application, enjoying using them to solve technological challenges. I develop my career as a Cloud Engineer designing, implementing and maintaining infrastructure in AWS.<\/span><\/p><p><span style=\"font-weight: 400;\">I actively collaborate in the Cloud Practice, where we research and experiment with new technologies, seeking solutions to the challenges faced by our clients.<\/span><\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-65c3381 elementor-widget elementor-widget-spacer\" data-id=\"65c3381\" data-element_type=\"widget\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-f668113 nav-column elementor-hidden-phone\" data-id=\"f668113\" 
data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-d289a2d elementor-widget elementor-widget-table-of-contents\" data-id=\"d289a2d\" data-element_type=\"widget\" data-settings=\"{&quot;headings_by_tags&quot;:[&quot;h2&quot;],&quot;exclude_headings_by_selector&quot;:[],&quot;sticky&quot;:&quot;top&quot;,&quot;sticky_offset&quot;:150,&quot;sticky_parent&quot;:&quot;yes&quot;,&quot;marker_view&quot;:&quot;numbers&quot;,&quot;no_headings_message&quot;:&quot;No headings were found on this page.&quot;,&quot;hierarchical_view&quot;:&quot;yes&quot;,&quot;min_height&quot;:{&quot;unit&quot;:&quot;px&quot;,&quot;size&quot;:&quot;&quot;,&quot;sizes&quot;:[]},&quot;min_height_tablet&quot;:{&quot;unit&quot;:&quot;px&quot;,&quot;size&quot;:&quot;&quot;,&quot;sizes&quot;:[]},&quot;min_height_mobile&quot;:{&quot;unit&quot;:&quot;px&quot;,&quot;size&quot;:&quot;&quot;,&quot;sizes&quot;:[]},&quot;sticky_on&quot;:[&quot;desktop&quot;,&quot;tablet&quot;,&quot;mobile&quot;],&quot;sticky_effects_offset&quot;:0,&quot;sticky_anchor_link_offset&quot;:0}\" data-widget_type=\"table-of-contents.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-toc__header\">\n\t\t\t\t\t\t<h4 class=\"elementor-toc__header-title\">\n\t\t\t\tNavigation\t\t\t<\/h4>\n\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<div id=\"elementor-toc__d289a2d\" class=\"elementor-toc__body\">\n\t\t\t<div class=\"elementor-toc__spinner-container\">\n\t\t\t\t<i class=\"elementor-toc__spinner eicon-animation-spin eicon-loading\" aria-hidden=\"true\"><\/i>\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-09395a3 
elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"09395a3\" data-element_type=\"section\" id=\"autores\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-2ba3c08\" data-id=\"2ba3c08\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-7a7e9b1 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"7a7e9b1\" data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-0225831\" data-id=\"0225831\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap\">\n\t\t\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-816937f\" data-id=\"816937f\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-756365e elementor-share-buttons--view-icon elementor-share-buttons--skin-minimal elementor-share-buttons--shape-circle elementor-grid-0 elementor-share-buttons--color-official elementor-widget elementor-widget-share-buttons\" data-id=\"756365e\" data-element_type=\"widget\" data-widget_type=\"share-buttons.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-grid\" role=\"list\">\n\t\t\t\t\t\t\t\t<div class=\"elementor-grid-item\" role=\"listitem\">\n\t\t\t\t\t\t<div class=\"elementor-share-btn elementor-share-btn_twitter\" role=\"button\" tabindex=\"0\" 
aria-label=\"Share on twitter\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class=\"elementor-share-btn__icon\">\n\t\t\t\t\t\t\t\t<i class=\"fab fa-twitter\" aria-hidden=\"true\"><\/i>\t\t\t\t\t\t\t<\/span>\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-grid-item\" role=\"listitem\">\n\t\t\t\t\t\t<div class=\"elementor-share-btn elementor-share-btn_linkedin\" role=\"button\" tabindex=\"0\" aria-label=\"Share on linkedin\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class=\"elementor-share-btn__icon\">\n\t\t\t\t\t\t\t\t<i class=\"fab fa-linkedin\" aria-hidden=\"true\"><\/i>\t\t\t\t\t\t\t<\/span>\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-f18d1b8 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"f18d1b8\" data-element_type=\"section\" data-settings=\"{&quot;background_background&quot;:&quot;classic&quot;}\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-inner-column elementor-element elementor-element-b490454\" data-id=\"b490454\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-a76f07c elementor-widget elementor-widget-heading\" data-id=\"a76f07c\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h5 class=\"elementor-heading-title elementor-size-default\">\u00bfQuieres saber m\u00e1s de lo que ofrecemos y ver otros casos de \u00e9xito?<\/h5>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element 
elementor-element-7365ecc elementor-align-center elementor-widget elementor-widget-button\" data-id=\"7365ecc\" data-element_type=\"widget\" data-widget_type=\"button.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-button-wrapper\">\n\t\t\t\t\t<a class=\"elementor-button elementor-button-link elementor-size-sm\" href=\"\/es\/\">\n\t\t\t\t\t\t<span class=\"elementor-button-content-wrapper\">\n\t\t\t\t\t\t\t\t\t<span class=\"elementor-button-text\">DESCUBRE BLUETAB<\/span>\n\t\t\t\t\t<\/span>\n\t\t\t\t\t<\/a>\n\t\t\t\t<\/div>\n\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<div class=\"elementor-element elementor-element-1ec3a45 elementor-widget elementor-widget-spacer\" data-id=\"1ec3a45\" data-element_type=\"widget\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-e6097d2 elementor-section-full_width elementor-section-height-default elementor-section-height-default\" data-id=\"e6097d2\" data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-top-column elementor-element elementor-element-a66a650\" data-id=\"a66a650\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-38e61b8 elementor-widget elementor-widget-text-editor\" data-id=\"38e61b8\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div 
class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p><b>SOLUCIONES, <\/b>SOMOS EXPERTOS<\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-9975abb elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"9975abb\" data-element_type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-33 elementor-inner-column elementor-element elementor-element-e5edefb\" data-id=\"e5edefb\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-0628f91 elementor-cta--skin-cover elementor-cta--valign-middle elementor-animated-content elementor-bg-transform elementor-bg-transform-zoom-in elementor-widget elementor-widget-call-to-action\" data-id=\"0628f91\" data-element_type=\"widget\" data-widget_type=\"call-to-action.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<a class=\"elementor-cta\" href=\"\/es\/soluciones\/data-strategy\/\">\n\t\t\t\t\t<div class=\"elementor-cta__bg-wrapper\">\n\t\t\t\t<div class=\"elementor-cta__bg elementor-bg lazyload\" style=\"background-image:inherit;\" role=\"img\" aria-label=\"strategy-opt\" data-bg-image=\"url(https:\/\/www.bluetab.net\/wp-content\/uploads\/2020\/10\/strategy-opt.jpg)\"><\/div>\n\t\t\t\t<div class=\"elementor-cta__bg-overlay\"><\/div>\n\t\t\t<\/div>\n\t\t\t\t\t\t\t<div class=\"elementor-cta__content\">\n\t\t\t\t\n\t\t\t\t\t\t\t\t\t<h5 class=\"elementor-cta__title elementor-cta__content-item elementor-content-item elementor-animated-item--grow\">\n\t\t\t\t\t\tDATA STRATEGY\t\t\t\t\t<\/h5>\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/a>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column 
elementor-col-33 elementor-inner-column elementor-element elementor-element-8724c63\" data-id=\"8724c63\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-bc8b25d elementor-cta--skin-cover elementor-cta--valign-middle elementor-animated-content elementor-bg-transform elementor-bg-transform-zoom-in elementor-widget elementor-widget-call-to-action\" data-id=\"bc8b25d\" data-element_type=\"widget\" data-widget_type=\"call-to-action.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<a class=\"elementor-cta\" href=\"\/es\/soluciones\/data-fabric\/\">\n\t\t\t\t\t<div class=\"elementor-cta__bg-wrapper\">\n\t\t\t\t<div class=\"elementor-cta__bg elementor-bg lazyload\" style=\"background-image:inherit;\" role=\"img\" aria-label=\"fabric-opt\" data-bg-image=\"url(https:\/\/www.bluetab.net\/wp-content\/uploads\/2020\/10\/fabric-opt.jpg)\"><\/div>\n\t\t\t\t<div class=\"elementor-cta__bg-overlay\"><\/div>\n\t\t\t<\/div>\n\t\t\t\t\t\t\t<div class=\"elementor-cta__content\">\n\t\t\t\t\n\t\t\t\t\t\t\t\t\t<h5 class=\"elementor-cta__title elementor-cta__content-item elementor-content-item elementor-animated-item--grow\">\n\t\t\t\t\t\tDATA FABRIC\t\t\t\t\t<\/h5>\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/a>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-33 elementor-inner-column elementor-element elementor-element-ddc996d\" data-id=\"ddc996d\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-1d384cf elementor-cta--skin-cover elementor-cta--valign-middle elementor-animated-content elementor-bg-transform elementor-bg-transform-zoom-in elementor-widget elementor-widget-call-to-action\" data-id=\"1d384cf\" data-element_type=\"widget\" 
data-widget_type=\"call-to-action.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<a class=\"elementor-cta\" href=\"\/es\/soluciones\/augmented-analytics\/\">\n\t\t\t\t\t<div class=\"elementor-cta__bg-wrapper\">\n\t\t\t\t<div class=\"elementor-cta__bg elementor-bg lazyload\" style=\"background-image:inherit;\" role=\"img\" aria-label=\"AUGMENTED-ANALYTICS-opt\" data-bg-image=\"url(https:\/\/www.bluetab.net\/wp-content\/uploads\/2020\/10\/AUGMENTED-ANALYTICS-opt.jpg)\"><\/div>\n\t\t\t\t<div class=\"elementor-cta__bg-overlay\"><\/div>\n\t\t\t<\/div>\n\t\t\t\t\t\t\t<div class=\"elementor-cta__content\">\n\t\t\t\t\n\t\t\t\t\t\t\t\t\t<h5 class=\"elementor-cta__title elementor-cta__content-item elementor-content-item elementor-animated-item--grow\">\n\t\t\t\t\t\tAUGMENTED ANALYTICS\t\t\t\t\t<\/h5>\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\t\t<\/div>\n\t\t\t\t\t\t<\/a>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-top-column elementor-element elementor-element-f61bae8\" data-id=\"f61bae8\" data-element_type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-f3fe106 elementor-widget elementor-widget-text-editor\" data-id=\"f3fe106\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p>Te puede interesar<\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<\/div>\n\t\t","protected":false},"excerpt":{"rendered":"<p>Alberto Jaen AWS Cloud Engineer Alfonso Jerez AWS Cloud Engineer Adri\u00e1n Jim\u00e9nez AWS Cloud Engineer Introduction This article is the second in a series of publications focusing on the creation of a LakeHouse with Hudi from a 
streaming ingest processed by a Flink application. The first article focuses on laying a good foundation for this [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":17826,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"elementor_header_footer","format":"standard","meta":{"inline_featured_image":false,"_uag_custom_page_level_css":"","_genesis_hide_title":false,"_genesis_hide_breadcrumbs":false,"_genesis_hide_singular_image":false,"_genesis_hide_footer_widgets":false,"_genesis_custom_body_class":"","_genesis_custom_post_class":"","_genesis_layout":"content-sidebar","footnotes":""},"categories":[632,668,633],"tags":[],"class_list":{"0":"post-15782","1":"post","2":"type-post","3":"status-publish","4":"format-standard","5":"has-post-thumbnail","7":"category-destacado","8":"category-practices-en","9":"category-tech-en","10":"entry"},"uagb_featured_image_src":{"full":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5.png",1200,600,false],"thumbnail":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-150x150.png",150,150,true],"medium":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-300x150.png",300,150,true],"medium_large":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-768x384.png",768,384,true],"large":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-1024x512.png",1024,512,true],"1536x1536":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5.png",1200,600,false],"2048x2048":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5.png",1200,600,false],"sidebar-featured":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-75x75.png",75,75,true],"genesis-singular-images":["https:\/\/bluetab.es\/wp-content\/uploads\/2023\/10\/5-702x526.png",702,526,true]},"uagb_author_info":{"display_name":"Bluetab","author_link":"https:\/\/bluetab.es\/en\/author\/user\/"},"uagb_comment_info":0,"uagb_excerpt":"Alberto Jaen AWS Cloud Engineer Alfonso Jerez AWS Cloud Engineer Adri\u00e1n Jim\u00e9nez 
AWS Cloud Engineer Introduction This article is the second in a series of publications focusing on the creation of a LakeHouse with Hudi from a streaming ingest processed by a Flink application. The first article focuses on laying a good foundation for this&hellip;","_links":{"self":[{"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/posts\/15782","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/comments?post=15782"}],"version-history":[{"count":24,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/posts\/15782\/revisions"}],"predecessor-version":[{"id":16026,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/posts\/15782\/revisions\/16026"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/media\/17826"}],"wp:attachment":[{"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/media?parent=15782"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/categories?post=15782"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/bluetab.es\/en\/wp-json\/wp\/v2\/tags?post=15782"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}