From 9efce66f09f3c77a61887f0999a1fe5f9b84b516 Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Fri, 23 Jan 2026 02:14:34 +0530 Subject: [PATCH] TEZ-4682: [Cloud] Tez AM docker image --- .../org/apache/tez/dag/app/DAGAppMaster.java | 2 +- tez-dist/pom.xml | 32 ++++ tez-dist/src/docker/Dockerfile | 89 ++++++++++ tez-dist/src/docker/README.md | 52 ++++++ tez-dist/src/docker/build-docker.sh | 128 +++++++++++++++ tez-dist/src/docker/conf/log4j2.properties | 25 +++ tez-dist/src/docker/conf/tez-site.xml | 61 +++++++ tez-dist/src/docker/entrypoint.sh | 153 ++++++++++++++++++ tez-dist/src/docker/tez.env | 31 ++++ 9 files changed, 572 insertions(+), 1 deletion(-) create mode 100644 tez-dist/src/docker/Dockerfile create mode 100644 tez-dist/src/docker/README.md create mode 100755 tez-dist/src/docker/build-docker.sh create mode 100644 tez-dist/src/docker/conf/log4j2.properties create mode 100644 tez-dist/src/docker/conf/tez-site.xml create mode 100644 tez-dist/src/docker/entrypoint.sh create mode 100644 tez-dist/src/docker/tez.env diff --git a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java index a8b76204bd..aff76220e5 100644 --- a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java +++ b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java @@ -2429,7 +2429,7 @@ public static void main(String[] args) { Objects.requireNonNull(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV + " is null"); - Configuration conf = new Configuration(); + Configuration conf = new TezConfiguration(); AMExtensions amExtensions = getFrameworkService(conf).getAMExtensions(); DAGProtos.ConfigurationProto confProto = amExtensions.loadConfigurationProto(); diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 9777d0c0b9..31dae3a28e 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -118,6 +118,38 @@ + + docker + + + + org.codehaus.mojo + exec-maven-plugin + + + build-docker-image + 
package + + exec + + + /bin/bash + + ${project.basedir}/src/docker/build-docker.sh + -hadoop + ${hadoop.version} + -tez + ${project.version} + -repo + apache + + + + + + + + diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/Dockerfile new file mode 100644 index 0000000000..3c3bcc170c --- /dev/null +++ b/tez-dist/src/docker/Dockerfile @@ -0,0 +1,89 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +ARG BUILD_ENV=unarchive + +# hadolint ignore=DL3006 +FROM ubuntu AS unarchive +ONBUILD COPY hadoop-*.tar.gz /opt +# UPDATED: Matches "tez-1.0.0-SNAPSHOT.tar.gz" pattern +ONBUILD COPY tez-*.tar.gz /opt + +FROM ${BUILD_ENV} AS env +ARG HADOOP_VERSION +ARG TEZ_VERSION + +RUN mkdir -p /opt/hadoop \ + && tar -xzv \ + --exclude="hadoop-$HADOOP_VERSION/share/doc" \ + --exclude="*/jdiff" \ + --exclude="*/sources" \ + --exclude="*tests.jar" \ + --exclude="*/webapps" \ + -f /opt/hadoop-$HADOOP_VERSION.tar.gz \ + -C /opt/hadoop --strip-components 1 \ + && mkdir -p /opt/tez \ + && tar -xzv \ + -f /opt/tez-$TEZ_VERSION.tar.gz \ + -C /opt/tez \ + && rm -rf /opt/hadoop-$HADOOP_VERSION.tar.gz /opt/tez-$TEZ_VERSION.tar.gz + +FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run + +ARG UID=1000 +ARG HADOOP_VERSION +ARG TEZ_VERSION + +# Install dependencies +# hadolint ignore=DL3041 +RUN set -ex; \ + microdnf update -y; \ + microdnf -y install procps gettext findutils; \ + microdnf clean all; \ + useradd --no-create-home -s /sbin/nologin -c "" --uid $UID tez + +# Set necessary environment variables +ENV HADOOP_HOME=/opt/hadoop \ + TEZ_HOME=/opt/tez \ + TEZ_CONF_DIR=/opt/tez/conf \ + HADOOP_CONF_DIR=/opt/tez/conf + +ENV TEZ_CLIENT_VERSION=$TEZ_VERSION + +ENV PATH=$TEZ_HOME/bin:$HADOOP_HOME/bin:$PATH + +COPY --from=env --chown=tez /opt/hadoop $HADOOP_HOME +# UPDATED: Copy from the normalized directory name created in 'env' stage +COPY --from=env --chown=tez /opt/tez $TEZ_HOME + +RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR + +COPY --chown=tez entrypoint.sh / +COPY --chown=tez conf $TEZ_CONF_DIR + +# Create Extension Point Directory +RUN mkdir -p /opt/tez/plugins && chown tez:tez /opt/tez/plugins && chmod 755 /opt/tez/plugins + +RUN chmod +x /entrypoint.sh + +USER tez +WORKDIR $TEZ_HOME + +# Expose AM ports via -p flag in docker command +# EXPOSE 10001 10002 10003 8042 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/tez-dist/src/docker/README.md 
b/tez-dist/src/docker/README.md new file mode 100644 index 0000000000..22a370e4cd --- /dev/null +++ b/tez-dist/src/docker/README.md @@ -0,0 +1,52 @@ + + +# Tez AM Docker + +1. Building the docker image: + + ```bash + mvn clean install -DskipTests -Pdocker,tools + ``` + +2. Installing and starting ZooKeeper on macOS (the image's default quorum is host.docker.internal:2181): + + ```bash + brew install zookeeper + zkServer start + ``` + +3. Running the Tez AM container: + + ```bash + docker run \ + -p 10001:10001 -p 8042:8042 \ + --name tez-am \ + apache/tez-am:1.0.0-SNAPSHOT + ``` + +4. Debugging the Tez AM container: + + ```bash + docker run \ + -p 10001:10001 -p 8042:8042 -p 5005:5005 \ + -e TEZ_FRAMEWORK_MODE="STANDALONE_ZOOKEEPER" \ + -e JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' \ + --name tez-am \ + apache/tez-am:1.0.0-SNAPSHOT + ``` diff --git a/tez-dist/src/docker/build-docker.sh b/tez-dist/src/docker/build-docker.sh new file mode 100755 index 0000000000..fabe94ed77 --- /dev/null +++ b/tez-dist/src/docker/build-docker.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -xeou pipefail + +HADOOP_VERSION= +TEZ_VERSION= +REPO= + +usage() { + cat <&2 +Usage: $0 [-h] [-hadoop ] [-tez ] [-repo ] +Build the Apache Tez AM Docker image +-help Display help +-hadoop Build image with the specified Hadoop version +-tez Build image with the specified Tez version +-repo Docker repository +EOF +} + +while [ $# -gt 0 ]; do + case "$1" in + -h) + usage + exit 0 + ;; + -hadoop) + shift + HADOOP_VERSION=$1 + shift + ;; + -tez) + shift + TEZ_VERSION=$1 + shift + ;; + -repo) + shift + REPO=$1 + shift + ;; + *) + shift + ;; + esac +done + +SCRIPT_DIR=$( + cd "$(dirname "$0")" + pwd +) + +DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../.."} +PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../.."} + +repo=${REPO:-apache} +WORK_DIR="$(mktemp -d)" +CACHE_DIR="$SCRIPT_DIR/cache" +mkdir -p "$CACHE_DIR" + +# Defaults Hadoop and Tez versions from pom.xml if not provided +HADOOP_VERSION=${HADOOP_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=hadoop.version -DforceStdout)} +TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)} + +###################### +# HADOOP FETCH LOGIC # +###################### +HADOOP_FILE_NAME="hadoop-$HADOOP_VERSION.tar.gz" +HADOOP_URL=${HADOOP_URL:-"https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"} +if [ ! -f "$CACHE_DIR/$HADOOP_FILE_NAME" ]; then + echo "Downloading Hadoop from $HADOOP_URL..." + if ! curl --fail -L "$HADOOP_URL" -o "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"; then + echo "Fail to download Hadoop, exiting...." 
+ exit 1 + fi + mv "$CACHE_DIR/$HADOOP_FILE_NAME.tmp" "$CACHE_DIR/$HADOOP_FILE_NAME" +fi + +##################################### +# Pick tez tarball from local build # +##################################### +TEZ_FILE_NAME="tez-$TEZ_VERSION.tar.gz" +LOCAL_DIST_PATH="$DIST_DIR/target/$TEZ_FILE_NAME" + +if [ -f "$LOCAL_DIST_PATH" ]; then + echo "--> Found local Tez build artifact at: $LOCAL_DIST_PATH" + cp "$LOCAL_DIST_PATH" "$WORK_DIR/" +else + echo "--> Error: Local Tez artifact not found at $LOCAL_DIST_PATH" + echo "--> Please build the project first (e.g., mvn clean install -DskipTests)." + exit 1 +fi + +# ------------------------------------------------------------------------- +# BUILD CONTEXT PREPARATION +# ------------------------------------------------------------------------- +cp "$CACHE_DIR/$HADOOP_FILE_NAME" "$WORK_DIR/" +cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" +cp "$SCRIPT_DIR/entrypoint.sh" "$WORK_DIR/" +cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/" + +echo "Building Docker image..." +docker build \ + "$WORK_DIR" \ + -f "$WORK_DIR/Dockerfile" \ + -t "$repo/tez-am:$TEZ_VERSION" \ + --build-arg "BUILD_ENV=unarchive" \ + --build-arg "HADOOP_VERSION=$HADOOP_VERSION" \ + --build-arg "TEZ_VERSION=$TEZ_VERSION" + +rm -r "${WORK_DIR}" +echo "Docker image $repo/tez-am:$TEZ_VERSION built successfully." diff --git a/tez-dist/src/docker/conf/log4j2.properties b/tez-dist/src/docker/conf/log4j2.properties new file mode 100644 index 0000000000..ddccb1b184 --- /dev/null +++ b/tez-dist/src/docker/conf/log4j2.properties @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n + +rootLogger.level = INFO +rootLogger.appenderRef.console.ref = console diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/conf/tez-site.xml new file mode 100644 index 0000000000..a38a9ae6d0 --- /dev/null +++ b/tez-dist/src/docker/conf/tez-site.xml @@ -0,0 +1,61 @@ + + + + + + + tez.am.client.am.port-range + 10001-10003 + + + + tez.am.resource.memory.mb + 1024 + + + + tez.framework.mode + STANDALONE_ZOOKEEPER + + + + tez.am.tez-ui.webservice.enable + false + + + + tez.am.zookeeper.quorum + host.docker.internal:2181 + + + + tez.am.log.level + DEBUG + + + + tez.am.mode.session + true + + + + tez.local.mode + true + + + diff --git a/tez-dist/src/docker/entrypoint.sh b/tez-dist/src/docker/entrypoint.sh new file mode 100644 index 0000000000..543c580ff2 --- /dev/null +++ b/tez-dist/src/docker/entrypoint.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -xeou pipefail + +################################################ +# 1. Mocking DAGAppMaster#main() env variables # +################################################ + +: "${CONTAINER_ID:="container_1700000000000_0001_01_000001"}" +: "${USER:="tez"}" +: "${HADOOP_USER_NAME:="tez"}" +: "${NM_HOST:="localhost"}" +: "${NM_PORT:="12345"}" +: "${NM_HTTP_PORT:="8042"}" +: "${LOCAL_DIRS:="/tmp"}" +: "${LOG_DIRS:="/opt/tez/logs"}" +: "${APP_SUBMIT_TIME_ENV:=$(($(date +%s) * 1000))}" +: "${TEZ_AM_EXTERNAL_ID:="tez-session-$(hostname)"}" + +export CONTAINER_ID USER HADOOP_USER_NAME NM_HOST NM_PORT NM_HTTP_PORT \ + LOCAL_DIRS LOG_DIRS APP_SUBMIT_TIME_ENV TEZ_AM_EXTERNAL_ID + +if [[ ! -f "tez-conf.pb" ]]; then + touch "tez-conf.pb" + echo "--> Created dummy tez-conf.pb" +fi + +if [[ ! -f "tez-dag.pb" ]]; then + touch "tez-dag.pb" + echo "--> Created dummy tez-dag.pb" +fi + +mkdir -p "$LOG_DIRS" + +########################## +# CONFIGURATION HANDLING # +########################## + +# Symlink hadoop conf in tez conf dir +if [[ -d "$HADOOP_HOME/etc/hadoop" ]]; then + echo "--> Linking missing Hadoop configs to $TEZ_CONF_DIR..." + for f in "$HADOOP_HOME/etc/hadoop"/*; do + basename=$(basename "$f") + # Only link files that are not already present, so that a user-provided + # hdfs-site.xml (or any other configuration file) is left untouched + if [[ ! 
-e "$TEZ_CONF_DIR/$basename" ]]; then + ln -s "$f" "$TEZ_CONF_DIR/$basename" + fi + done +fi + +########################### +# Custom Config directory # +########################### +if [[ -n "${TEZ_CUSTOM_CONF_DIR:-}" ]] && [[ -d "$TEZ_CUSTOM_CONF_DIR" ]]; then + echo "--> Using custom configuration directory: $TEZ_CUSTOM_CONF_DIR" + find "${TEZ_CUSTOM_CONF_DIR}" -type f -exec \ + ln -sf {} "${TEZ_CONF_DIR}"/ \; + + # If a tez-site.xml.template is present, render it to tez-site.xml with envsubst + if [[ -f "$TEZ_CONF_DIR/tez-site.xml.template" ]]; then + envsubst < "$TEZ_CONF_DIR/tez-site.xml.template" > "$TEZ_CONF_DIR/tez-site.xml" + fi +fi + +############# +# CLASSPATH # +############# + +export HADOOP_USER_CLASSPATH_FIRST=true +# Order is: conf -> plugins -> tez jars -> hadoop jars +CLASSPATH="${TEZ_CONF_DIR}" + +# Custom Plugins +# This allows mounting a volume at /opt/tez/plugins containing aux jars +PLUGIN_DIR="/opt/tez/plugins" +if [[ -d "$PLUGIN_DIR" ]]; then + count=$(find "$PLUGIN_DIR" -maxdepth 1 -name "*.jar" 2>/dev/null | wc -l) + if [ "$count" != "0" ]; then + echo "--> Found $count plugin jars. Prepending to classpath." + CLASSPATH="${CLASSPATH}:${PLUGIN_DIR}/*" + fi +fi + +# Tez Jars +CLASSPATH="${CLASSPATH}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*" + +# Hadoop Jars +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*" + +############# +# Execution # +############# +TEZ_DAG_JAR=$(find "$TEZ_HOME" -maxdepth 1 -name "tez-dag-*.jar" ! 
-name "*-tests.jar" | head -n 1) + +if [ -z "$TEZ_DAG_JAR" ]; then + echo "Error: Could not find tez-dag-*.jar in $TEZ_HOME" + exit 1 +fi + +echo "--> Starting DAGAppMaster..." +echo "--> HADOOP_CONF_DIR: $HADOOP_CONF_DIR" + +: "${TEZ_AM_HEAP_OPTS:="-Xmx2048m"}" + +# Check for Log4j2 Configuration +LOG4J2_FILE="$TEZ_CONF_DIR/log4j2.properties" +if [[ -f "$LOG4J2_FILE" ]]; then + echo "--> [TEZ-AM] Found Log4j2 configuration: $LOG4J2_FILE" + JAVA_OPTS="${JAVA_OPTS:+$JAVA_OPTS }-Dlog4j.configurationFile=file:$LOG4J2_FILE" +fi + +JAVA_ADD_OPENS=( + "--add-opens=java.base/java.lang=ALL-UNNAMED" + "--add-opens=java.base/java.util=ALL-UNNAMED" + "--add-opens=java.base/java.io=ALL-UNNAMED" +) + +read -r -a JAVA_OPTS_ARR <<< "${JAVA_OPTS:-}" +read -r -a HEAP_OPTS_ARR <<< "${TEZ_AM_HEAP_OPTS}" + +exec java "${HEAP_OPTS_ARR[@]}" "${JAVA_OPTS_ARR[@]}" "${JAVA_ADD_OPENS[@]}" \ + -Duser.name="$HADOOP_USER_NAME" \ + -Djava.library.path="$HADOOP_HOME/lib/native" \ + -Dhadoop.home.dir="$HADOOP_HOME" \ + -Dhadoop.log.dir="$LOG_DIRS" \ + -Dtez.conf.dir="$TEZ_CONF_DIR" \ + -cp "$CLASSPATH" \ + org.apache.tez.dag.app.DAGAppMaster --session \ + "$@" diff --git a/tez-dist/src/docker/tez.env b/tez-dist/src/docker/tez.env new file mode 100644 index 0000000000..ce7d4d278f --- /dev/null +++ b/tez-dist/src/docker/tez.env @@ -0,0 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Tez AM Container Environment Configuration + +HADOOP_USER_NAME=tez +USER=tez +CONTAINER_ID=container_1700000000000_0001_01_000001 +NM_HOST=localhost +NM_PORT=12345 +NM_HTTP_PORT=8042 +TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER +TEZ_AM_ZOOKEEPER_QUORUM=host.docker.internal:2181 +TEZ_AM_LOG_LEVEL=INFO +# TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf +# Enable remote debugging on port 5005 +#JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005'