-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDockerfile.test
More file actions
129 lines (110 loc) · 6.66 KB
/
Dockerfile.test
File metadata and controls
129 lines (110 loc) · 6.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
FROM benchflow/base-images:dns-envconsul-java8_dev
# MAINTAINER is deprecated (hadolint DL4000); record the maintainer as a label instead.
LABEL maintainer="Vincenzo FERME <info@vincenzoferme.it>"
# Spark runtime configuration used by the scheduler and the scripts it launches.
# key=value form: the space-separated `ENV key value` form is deprecated
# (build-check LegacyKeyValueFormat).
ENV SPARK_HOME=/usr/spark \
    SPARK_VERSION=1.6.2 \
    PYSPARK_PYTHON=python2.7 \
    PYSPARK_CASSANDRA_VERSION=0.3.5 \
    HADOOP_VERSION=2.6
# glibc / locale / miniconda versions (needed because scipy links against glibc).
ENV GLIBC_VERSION=2.23-r3 \
    LANG=C.UTF-8 \
    MINICONDA_PYTHON_VERSION=2 \
    MINICONDA_VERSION=4.0.5
# benchflow component versions fetched from GitHub below.
ENV DATA_ANALYSES_SCHEDULER_VERSION=v-dev \
    DATA_TRANSFORMERS_VERSION=v-dev \
    ANALYSERS_VERSION=v-dev \
    PLUGINS_VERSION=v-dev
# The two Python interpreters available in the image: the Alpine system one
# and the miniconda one installed below.
ENV REGULAR_PYTHON=/usr/bin/python2.7 \
    CONDA_PYTHON=/opt/conda/bin/python2.7
# Configure environment for miniconda.
# PATH must be a separate ENV instruction: within a single ENV, $CONDA_DIR
# would expand to its value from *before* the instruction (i.e. empty).
ENV CONDA_DIR=/opt/conda
ENV PATH=$CONDA_DIR/bin:$PATH
# TODO: remove python, when Spark will be used outside of the container
RUN apk --update add curl wget tar python && \
    # Add Spark. --fail makes curl exit non-zero on an HTTP error instead of
    # piping an error page into gunzip/tar (which would otherwise cache a
    # broken layer).
    curl \
    --fail \
    --location \
    --retry 10 \
    http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz \
    | gunzip \
    | tar x -C /usr/ && \
    # Stable path for SPARK_HOME regardless of the pinned version.
    ln -s /usr/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /usr/spark
# Separate so that Docker can cache the Spark layer
# Get data-transformers (TODO: checkout git repo to get always the latest version of code)
# TODO: Improve the following code to download only once from github and keep all the wanted files in the right directories
RUN mkdir -p /app/configuration && \
    mkdir -p /app/data-transformers && \
    wget -q --no-check-certificate -O - https://github.com/benchflow/data-transformers/archive/$DATA_TRANSFORMERS_VERSION.tar.gz \
    | tar xz --strip-components=2 -C /app/data-transformers data-transformers-$DATA_TRANSFORMERS_VERSION/data-transformers && \
    # Get data-transformers scheduler configuration file
    wget -q --no-check-certificate -O - https://github.com/benchflow/data-transformers/archive/$DATA_TRANSFORMERS_VERSION.tar.gz \
    | tar xz --strip-components=1 --wildcards --no-anchored '*.scheduler.configuration.yml' && \
    # Absolute destination: the previous relative "app/configuration/$f" only
    # resolved correctly because the build cwd happens to be /.
    for f in *.scheduler.configuration.yml; do mv -i "$f" "/app/configuration/$f"; done && \
    # Get analysers
    mkdir -p /app/analysers && \
    wget -q --no-check-certificate -O - https://github.com/benchflow/analysers/archive/$ANALYSERS_VERSION.tar.gz \
    | tar xz --strip-components=2 -C /app/analysers analysers-$ANALYSERS_VERSION/analysers && \
    # Get analyser scheduler configuration file
    wget -q --no-check-certificate -O - https://github.com/benchflow/analysers/archive/$ANALYSERS_VERSION.tar.gz \
    | tar xz --strip-components=1 --wildcards --no-anchored '*.scheduler.configuration.yml' && \
    for f in *.scheduler.configuration.yml; do mv -i "$f" "/app/configuration/$f"; done && \
    # Get plugins (configuration files) TODO: do not let the following code fails if nothing is found in the .tar.gunzip
    mkdir -p /app/data-transformers/suts && \
    wget -q --no-check-certificate -O - https://github.com/benchflow/sut-plugins/archive/$PLUGINS_VERSION.tar.gz \
    | tar xz --strip-components=1 -C /app/data-transformers/suts --wildcards --no-anchored 'data-transformers.configuration.yml'
# TODO: currenlty skipping analysers configuration because we don't have any
# && \
# mkdir -p /app/analysers/suts && \
# wget -q --no-check-certificate -O - https://github.com/benchflow/sut-plugins/archive/$PLUGINS_VERSION.tar.gz \
# | tar xz --strip-components=1 -C /app/analysers/suts --wildcards --no-anchored 'analysers.configuration.yml'
# Clean up
# NOTE(review): removing packages in a *separate* layer does not shrink the
# image -- the files still exist in the earlier layers. Also curl is
# re-installed below for the miniconda setup; consider folding this cleanup
# into the RUNs that installed these tools.
RUN apk del --purge curl wget tar && \
rm -rf /var/cache/apk/*
# Scheduler configuration consumed at runtime.
COPY ./configuration.yml /app/configuration.yml
# Cassandra connector jar placed where spark-submit picks it up.
COPY ./dependencies/pyspark-cassandra-assembly-$PYSPARK_CASSANDRA_VERSION.jar $SPARK_HOME/
COPY ./configuration/spark/log4j.properties $SPARK_HOME/conf/
# Pre-built scheduler binary (built outside this Dockerfile).
COPY ./bin/data-analyses-scheduler /app/data-analyses-scheduler
# NOTE(review): with BuildKit this chmod could be folded into the COPY above
# via --chmod=0755, avoiding a duplicated layer for the binary.
RUN chmod +x /app/data-analyses-scheduler
# envconsul template used by the base image's envcp service.
COPY ./services/envcp/config.tpl /app/config.tpl
# chaperone service definitions (process supervisor from the base image).
COPY ./services/300-data-analyses-scheduler.conf /apps/chaperone.d/300-data-analyses-scheduler.conf
#TODO: remove, when Spark will be used as a service outside of this container
COPY ./services/400-clean-tmp-folder.conf /apps/chaperone.d/400-clean-tmp-folder.conf
#TODO: remove, when Spark will be used as a service outside of this container
# disables the Spark UI when launching scripts (http://stackoverflow.com/questions/33774350/how-to-disable-sparkui-programmatically/33784803#33784803)
# Create spark-defaults.conf from the shipped template and append the
# ui-disable flag as a new last line.
RUN cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf \
    && echo 'spark.ui.enabled false' >> $SPARK_HOME/conf/spark-defaults.conf
# Install miniconda and scipy though it, to solve a linking error with scipy (refer to: https://travis-ci.org/benchflow/analysers/builds/147443000#L485)
# Reference for miniconda setup: https://github.com/CognitiveScale/alpine-miniconda/blob/conda-4.0.5/Dockerfile
RUN apk upgrade --update && \
    # Install glibc and other dependencies needed by miniconda
    apk add --update curl ca-certificates bash git bzip2 unzip sudo libstdc++ glib libxext libxrender && \
    # --fail so a 404 on a glibc apk aborts the build instead of saving an error page.
    for pkg in glibc-${GLIBC_VERSION} glibc-bin-${GLIBC_VERSION} glibc-i18n-${GLIBC_VERSION}; do curl -sSL --fail https://github.com/andyshinn/alpine-pkg-glibc/releases/download/${GLIBC_VERSION}/${pkg}.apk -o /tmp/${pkg}.apk; done && \
    apk add --allow-untrusted /tmp/*.apk && \
    rm -v /tmp/*.apk && \
    # localedef may partially fail on alpine; the || true keeps the best-effort behavior.
    ( /usr/glibc-compat/bin/localedef --force --inputfile POSIX --charmap UTF-8 C.UTF-8 || true ) && \
    echo "export LANG=C.UTF-8" > /etc/profile.d/locale.sh && \
    /usr/glibc-compat/sbin/ldconfig /lib /usr/glibc-compat/lib && \
    # Install miniconda
    mkdir -p $CONDA_DIR && \
    echo export PATH=$CONDA_DIR/bin:'$PATH' > /etc/profile.d/conda.sh && \
    curl --fail https://repo.continuum.io/miniconda/Miniconda${MINICONDA_PYTHON_VERSION}-${MINICONDA_VERSION}-Linux-x86_64.sh -o mconda.sh && \
    /bin/bash mconda.sh -f -b -p $CONDA_DIR && \
    rm mconda.sh && \
    $CONDA_DIR/bin/conda install --yes conda==${MINICONDA_VERSION} && \
    # Install scipy using miniconda; --yes so the non-interactive build never
    # blocks on conda's "Proceed?" prompt.
    conda install --yes scipy && \
    conda install --yes pytz && \
    conda install --yes -c anaconda dateutil=2.4.1 && \
    # TODO: evaluate if possible to remove miniconda and its dependencies (git, bzip2, unzip, sudo, libstdc++, glib, libxext, libxrender) at this point
    apk del glibc-i18n
# adds Alpine's testing repository and install scripts dependencies (py-numpy, py-yaml)
# NOTE(review): the sed appends the @testing repo as the last line of
# /etc/apk/repositories; the apk cache left by --update here is removed by
# the rm -rf /var/cache/apk/* in the next RUN.
RUN sed -i -e '$a@testing http://dl-4.alpinelinux.org/alpine/edge/testing' /etc/apk/repositories \
&& apk --update add py-yaml py-dateutil
# adds pip and install scripts dependencies (future)
RUN apk --update add py-pip \
    # Target the system pip explicitly: PATH puts /opt/conda/bin first, so a
    # bare `pip` resolves to conda's pip and the upgrade would never reach the
    # /usr/bin/pip actually used below.
    && ${REGULAR_PYTHON} /usr/bin/pip install --no-cache-dir --upgrade pip \
    # --no-cache-dir keeps pip's download cache out of the image layer (DL3042).
    && ${REGULAR_PYTHON} /usr/bin/pip install --no-cache-dir future \
    && ${REGULAR_PYTHON} /usr/bin/pip install --no-cache-dir minio \
    && apk del --purge py-pip \
    && rm -rf /var/cache/apk/*
# Exec (JSON-array) form: the scheduler runs as PID 1 and receives SIGTERM
# directly from `docker stop`; shell form would wrap it in /bin/sh -c.
CMD ["/app/data-analyses-scheduler"]
# Documentation of the scheduler's listening port (does not publish it).
EXPOSE 8080