diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml
index 2c80ffae5..8eee9fd2f 100644
--- a/roles/download/defaults/main.yml
+++ b/roles/download/defaults/main.yml
@@ -132,14 +132,14 @@ kubednsautoscaler_image_repo: "gcr.io/google_containers/cluster-proportional-aut
kubednsautoscaler_image_tag: "{{ kubednsautoscaler_version }}"
test_image_repo: busybox
test_image_tag: latest
-elasticsearch_version: "v2.4.1"
-elasticsearch_image_repo: "gcr.io/google_containers/elasticsearch"
+elasticsearch_version: "v5.6.4"
+elasticsearch_image_repo: "k8s.gcr.io/elasticsearch"
elasticsearch_image_tag: "{{ elasticsearch_version }}"
-fluentd_version: "1.22"
-fluentd_image_repo: "gcr.io/google_containers/fluentd-elasticsearch"
+fluentd_version: "v2.0.4"
+fluentd_image_repo: "k8s.gcr.io/fluentd-elasticsearch"
fluentd_image_tag: "{{ fluentd_version }}"
-kibana_version: "v4.6.1"
-kibana_image_repo: "gcr.io/google_containers/kibana"
+kibana_version: "5.6.4"
+kibana_image_repo: "docker.elastic.co/kibana/kibana"
kibana_image_tag: "{{ kibana_version }}"
helm_version: "v2.9.1"
helm_image_repo: "lachlanevenson/k8s-helm"
diff --git a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml
index dd5b9b630..4b9ab0067 100644
--- a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml
+++ b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml
@@ -1,9 +1,12 @@
---
kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1beta1
+apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: efk
namespace: kube-system
+ labels:
+ kubernetes.io/cluster-service: "true"
+ addonmanager.kubernetes.io/mode: Reconcile
subjects:
- kind: ServiceAccount
name: efk
diff --git a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml
index 75d75f650..01e774e96 100644
--- a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml
+++ b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml
@@ -6,3 +6,4 @@ metadata:
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
+ addonmanager.kubernetes.io/mode: Reconcile
diff --git a/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2 b/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2
index 4cdcf33ad..51666c1f2 100644
--- a/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2
+++ b/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2
@@ -1,15 +1,17 @@
---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-elasticsearch/es-controller.yaml
-apiVersion: extensions/v1beta1
-kind: Deployment
+# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.10.2/cluster/addons/fluentd-elasticsearch/es-statefulset.yaml
+apiVersion: apps/v1
+kind: StatefulSet
metadata:
- name: elasticsearch-logging-v1
+ name: elasticsearch-logging
namespace: kube-system
labels:
k8s-app: elasticsearch-logging
version: "{{ elasticsearch_image_tag }}"
kubernetes.io/cluster-service: "true"
+ addonmanager.kubernetes.io/mode: Reconcile
spec:
+ serviceName: elasticsearch-logging
replicas: 2
selector:
matchLabels:
@@ -53,4 +55,10 @@ spec:
{% if rbac_enabled %}
serviceAccountName: efk
{% endif %}
+ initContainers:
+ - image: alpine:3.6
+ command: ["/sbin/sysctl", "-w", "vm.max_map_count=262144"]
+ name: elasticsearch-logging-init
+ securityContext:
+ privileged: true
diff --git a/roles/kubernetes-apps/efk/fluentd/defaults/main.yml b/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
index e8d93732c..0305a5f7a 100644
--- a/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
+++ b/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
@@ -1,7 +1,7 @@
---
fluentd_cpu_limit: 0m
-fluentd_mem_limit: 200Mi
+fluentd_mem_limit: 500Mi
fluentd_cpu_requests: 100m
fluentd_mem_requests: 200Mi
-fluentd_config_dir: /etc/kubernetes/fluentd
-fluentd_config_file: fluentd.conf
+fluentd_config_dir: /etc/fluent/config.d
+# fluentd_config_file: fluentd.conf
diff --git a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2 b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
index b7de44dc0..0b0229f69 100644
--- a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
+++ b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
@@ -1,10 +1,19 @@
+---
+# https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.10/cluster/addons/fluentd-elasticsearch/fluentd-es-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: fluentd-config
namespace: "kube-system"
+ labels:
+ addonmanager.kubernetes.io/mode: Reconcile
data:
- {{ fluentd_config_file }}: |
+  system.conf: |-
+    <system>
+      root_dir /tmp/fluentd-buffers/
+    </system>
+
+ containers.input.conf: |-
# This configuration file for Fluentd / td-agent is used
# to watch changes to Docker log files. The kubelet creates symlinks that
# capture the pod name, namespace, container name & Docker container ID
@@ -18,7 +27,6 @@ data:
# See https://github.com/uken/fluent-plugin-elasticsearch &
# https://github.com/fabric8io/fluent-plugin-kubernetes_metadata_filter for
# more information about the plugins.
- # Maintainer: Jimmi Dyson
#
# Example
# =======
@@ -99,63 +107,87 @@ data:
# This makes it easier for users to search for logs by pod name or by
# the name of the Kubernetes container regardless of how many times the
# Kubernetes pod has been restarted (resulting in a several Docker container IDs).
- #
- # TODO: Propagate the labels associated with a container along with its logs
- # so users can query logs using labels as well as or instead of the pod name
- # and container name. This is simply done via configuration of the Kubernetes
- # fluentd plugin but requires secrets to be enabled in the fluent pod. This is a
- # problem yet to be solved as secrets are not usable in static pods which the fluentd
- # pod must be until a per-node controller is available in Kubernetes.
- # Prevent fluentd from handling records containing its own logs. Otherwise
- # it can lead to an infinite loop, when error in sending one message generates
- # another message which also fails to be sent and so on.
-
- type null
-
- # Example:
+
+ # Json Log Example:
# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
+ # CRI Log Example:
+ # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
- type tail
+ @id fluentd-containers.log
+ @type tail
path /var/log/containers/*.log
pos_file /var/log/es-containers.log.pos
time_format %Y-%m-%dT%H:%M:%S.%NZ
- tag kubernetes.*
- format json
+ tag raw.kubernetes.*
read_from_head true
+
+ @type multi_format
+
+ format json
+ time_key time
+ time_format %Y-%m-%dT%H:%M:%S.%NZ
+
+
+ format /^(?
+
+
+ # Detect exceptions in the log output and forward them as one log entry.
+
+ @id raw.kubernetes
+ @type detect_exceptions
+ remove_tag_prefix raw
+ message log
+ stream stream
+ multiline_flush_interval 5
+ max_bytes 500000
+ max_lines 1000
+
+
+ system.input.conf: |-
# Example:
# 2015-12-21 23:17:22,066 [salt.state ][INFO ] Completed state [net.ipv4.ip_forward] at time 23:17:22.066081
- type tail
+ @id minion
+ @type tail
format /^(?
+
# Example:
# Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
- type tail
+ @id startupscript.log
+ @type tail
format syslog
path /var/log/startupscript.log
pos_file /var/log/es-startupscript.log.pos
tag startupscript
+
# Examples:
# time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
# time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
+ # TODO(random-liu): Remove this after cri container runtime rolls out.
- type tail
+ @id docker.log
+ @type tail
format /^time="(?
+
# Example:
# 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
- type tail
+ @id etcd.log
+ @type tail
# Not parsing this, because it doesn't have anything particularly useful to
# parse out of it (like severities).
format none
@@ -163,13 +195,16 @@ data:
pos_file /var/log/es-etcd.log.pos
tag etcd
+
# Multi-line parsing is required for all the kube logs because very large log
# statements, such as those that include entire object bodies, get split into
# multiple lines by glog.
+
# Example:
# I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
- type tail
+ @id kubelet.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -179,10 +214,12 @@ data:
pos_file /var/log/es-kubelet.log.pos
tag kubelet
+
# Example:
# I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
- type tail
+ @id kube-proxy.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -192,10 +229,12 @@ data:
pos_file /var/log/es-kube-proxy.log.pos
tag kube-proxy
+
# Example:
# I0204 07:00:19.604280 5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
- type tail
+ @id kube-apiserver.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -205,10 +244,12 @@ data:
pos_file /var/log/es-kube-apiserver.log.pos
tag kube-apiserver
+
# Example:
# I0204 06:55:31.872680 5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
- type tail
+ @id kube-controller-manager.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -218,10 +259,12 @@ data:
pos_file /var/log/es-kube-controller-manager.log.pos
tag kube-controller-manager
+
# Example:
# W0204 06:49:18.239674 7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
- type tail
+ @id kube-scheduler.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -231,10 +274,12 @@ data:
pos_file /var/log/es-kube-scheduler.log.pos
tag kube-scheduler
+
# Example:
# I1104 10:36:20.242766 5 rescheduler.go:73] Running Rescheduler
- type tail
+ @id rescheduler.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -244,10 +289,12 @@ data:
pos_file /var/log/es-rescheduler.log.pos
tag rescheduler
+
# Example:
# I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
- type tail
+ @id glbc.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -257,10 +304,12 @@ data:
pos_file /var/log/es-glbc.log.pos
tag glbc
+
# Example:
# I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
- type tail
+ @id cluster-autoscaler.log
+ @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -270,59 +319,123 @@ data:
pos_file /var/log/es-cluster-autoscaler.log.pos
tag cluster-autoscaler
+
+ # Logs from systemd-journal for interesting services.
+ # TODO(random-liu): Remove this after cri container runtime rolls out.
+
+ @id journald-docker
+ @type systemd
+ filters [{ "_SYSTEMD_UNIT": "docker.service" }]
+
+ @type local
+ persistent true
+
+ read_from_head true
+ tag docker
+
+
+ #
+ # @id journald-container-runtime
+ # @type systemd
+    #   filters [{ "_SYSTEMD_UNIT": "{% raw %}{{ container_runtime }}{% endraw %}.service" }]
+ #
+ # @type local
+ # persistent true
+ #
+ # read_from_head true
+ # tag container-runtime
+ #
+
+
+ @id journald-kubelet
+ @type systemd
+ filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
+
+ @type local
+ persistent true
+
+ read_from_head true
+ tag kubelet
+
+
+
+ @id journald-node-problem-detector
+ @type systemd
+ filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
+
+ @type local
+ persistent true
+
+ read_from_head true
+ tag node-problem-detector
+
+
+ forward.input.conf: |-
+ # Takes the messages sent over TCP
+
+ @type forward
+
+
+ monitoring.conf: |-
+ # Prometheus Exporter Plugin
+ # input plugin that exports metrics
+
+ @type prometheus
+
+
+
+ @type monitor_agent
+
+
+ # input plugin that collects metrics from MonitorAgent
+
+ @type prometheus_monitor
+
+ host ${hostname}
+
+
+
+ # input plugin that collects metrics for output plugin
+
+ @type prometheus_output_monitor
+
+ host ${hostname}
+
+
+
+ # input plugin that collects metrics for in_tail plugin
+
+ @type prometheus_tail_monitor
+
+ host ${hostname}
+
+
+
+ output.conf: |-
+ # Enriches records with Kubernetes metadata
- type kubernetes_metadata
+ @type kubernetes_metadata
- ## Prometheus Exporter Plugin
- ## input plugin that exports metrics
- #
- # type prometheus
- #
- #
- # type monitor_agent
- #
- #
- # type forward
- #
- ## input plugin that collects metrics from MonitorAgent
- #
- # @type prometheus_monitor
- #
- # host ${hostname}
- #
- #
- ## input plugin that collects metrics for output plugin
- #
- # @type prometheus_output_monitor
- #
- # host ${hostname}
- #
- #
- ## input plugin that collects metrics for in_tail plugin
- #
- # @type prometheus_tail_monitor
- #
- # host ${hostname}
- #
- #
+
- type elasticsearch
- user "#{ENV['FLUENT_ELASTICSEARCH_USER']}"
- password "#{ENV['FLUENT_ELASTICSEARCH_PASSWORD']}"
- log_level info
- include_tag_key true
- host elasticsearch-logging
- port 9200
- logstash_format true
- # Set the chunk limit the same as for fluentd-gcp.
- buffer_chunk_limit 2M
- # Cap buffer memory usage to 2MiB/chunk * 32 chunks = 64 MiB
- buffer_queue_limit 32
- flush_interval 5s
- # Never wait longer than 5 minutes between retries.
- max_retry_wait 30
- # Disable the limit on the number of retries (retry forever).
- disable_retry_limit
- # Use multiple threads for processing.
- num_threads 8
-
+ @id elasticsearch
+ @type elasticsearch
+ @log_level info
+ include_tag_key true
+ host elasticsearch-logging
+ port 9200
+ logstash_format true
+
+ @type file
+ path /var/log/fluentd-buffers/kubernetes.system.buffer
+ flush_mode interval
+ retry_type exponential_backoff
+ flush_thread_count 2
+ flush_interval 5s
+ retry_forever
+ retry_max_interval 30
+ chunk_limit_size 2M
+ queue_limit_length 8
+ overflow_action block
+      </buffer>
+    </match>
diff --git a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2 b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
index f23a8851c..3a911cf38 100644
--- a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
+++ b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
@@ -1,32 +1,42 @@
---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-elasticsearch/es-controller.yaml
-apiVersion: extensions/v1beta1
+# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.10.2/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml
+apiVersion: apps/v1
kind: DaemonSet
metadata:
- name: "fluentd-es-v{{ fluentd_version }}"
+ name: "fluentd-es-{{ fluentd_version }}"
namespace: "kube-system"
labels:
k8s-app: fluentd-es
+ version: "{{ fluentd_version }}"
kubernetes.io/cluster-service: "true"
- version: "v{{ fluentd_version }}"
+ addonmanager.kubernetes.io/mode: Reconcile
spec:
+ selector:
+ matchLabels:
+ k8s-app: fluentd-es
+ version: "{{ fluentd_version }}"
template:
metadata:
labels:
k8s-app: fluentd-es
kubernetes.io/cluster-service: "true"
- version: "v{{ fluentd_version }}"
+ version: "{{ fluentd_version }}"
+ # This annotation ensures that fluentd does not get evicted if the node
+ # supports critical pod annotation based priority scheme.
+ # Note that this does not guarantee admission on the nodes (#40573).
+ annotations:
+ scheduler.alpha.kubernetes.io/critical-pod: ''
spec:
- tolerations:
- - effect: NoSchedule
- operator: Exists
+ priorityClassName: system-node-critical
+{% if rbac_enabled %}
+ serviceAccountName: efk
+{% endif %}
containers:
- name: fluentd-es
image: "{{ fluentd_image_repo }}:{{ fluentd_image_tag }}"
- command:
- - '/bin/sh'
- - '-c'
- - '/usr/sbin/td-agent -c {{ fluentd_config_dir }}/{{ fluentd_config_file}} 2>&1 >> /var/log/fluentd.log'
+ env:
+ - name: FLUENTD_ARGS
+ value: "--no-supervisor -q"
resources:
limits:
{% if fluentd_cpu_limit is defined and fluentd_cpu_limit != "0m" %}
@@ -34,27 +44,26 @@ spec:
{% endif %}
memory: {{ fluentd_mem_limit }}
requests:
- cpu: {{ fluentd_cpu_requests }}
+ cpu: {{ fluentd_cpu_requests }}
memory: {{ fluentd_mem_requests }}
volumeMounts:
- name: varlog
mountPath: /var/log
- - name: dockercontainers
+ - name: varlibdockercontainers
mountPath: "{{ docker_daemon_graph }}/containers"
readOnly: true
- - name: config
+ - name: config-volume
mountPath: "{{ fluentd_config_dir }}"
+ nodeSelector:
+ beta.kubernetes.io/fluentd-ds-ready: "true"
terminationGracePeriodSeconds: 30
volumes:
- name: varlog
hostPath:
path: /var/log
- - name: dockercontainers
+ - name: varlibdockercontainers
hostPath:
path: {{ docker_daemon_graph }}/containers
- - name: config
- configMap:
- name: fluentd-config
-{% if rbac_enabled %}
- serviceAccountName: efk
-{% endif %}
+ - name: config-volume
+ configMap:
+ name: fluentd-config
diff --git a/roles/kubernetes-apps/efk/kibana/defaults/main.yml b/roles/kubernetes-apps/efk/kibana/defaults/main.yml
index 0651a032d..c76e3e710 100644
--- a/roles/kubernetes-apps/efk/kibana/defaults/main.yml
+++ b/roles/kubernetes-apps/efk/kibana/defaults/main.yml
@@ -4,3 +4,4 @@ kibana_mem_limit: 0M
kibana_cpu_requests: 100m
kibana_mem_requests: 0M
kibana_service_port: 5601
+kibana_base_url: "/api/v1/namespaces/kube-system/services/kibana-logging/proxy"
diff --git a/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2 b/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2
index c5603d389..880482d4d 100644
--- a/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2
+++ b/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2
@@ -1,6 +1,6 @@
---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-kibana/kibana-controller.yaml
-apiVersion: extensions/v1beta1
+# https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.10/cluster/addons/fluentd-elasticsearch/kibana-deployment.yaml
+apiVersion: apps/v1
kind: Deployment
metadata:
name: kibana-logging
@@ -36,10 +36,12 @@ spec:
env:
- name: "ELASTICSEARCH_URL"
value: "http://elasticsearch-logging:{{ elasticsearch_service_port }}"
-{% if kibana_base_url is defined and kibana_base_url != "" %}
- - name: "KIBANA_BASE_URL"
+ - name: "SERVER_BASEPATH"
value: "{{ kibana_base_url }}"
-{% endif %}
+ - name: XPACK_MONITORING_ENABLED
+ value: "false"
+ - name: XPACK_SECURITY_ENABLED
+ value: "false"
ports:
- containerPort: 5601
name: ui