From ad2d3d4c205b679e7f87cb7ae6d7d5ee737558b7 Mon Sep 17 00:00:00 2001 From: Prashanth Balasubramanian Date: Sun, 22 Nov 2015 16:06:04 -0800 Subject: [PATCH] Docs etc --- cluster/gce/config-default.sh | 1 - cluster/gce/config-test.sh | 1 - cluster/gce/util.sh | 21 +-- cluster/saltbase/salt/flannel-server/init.sls | 2 +- cluster/saltbase/salt/flannel/network.json | 8 - cluster/saltbase/salt/top.sls | 3 - cmd/kubelet/app/server.go | 12 +- docs/admin/kubelet.md | 3 +- docs/proposals/flannel-integration.md | 164 ++++++++++++++++-- hack/verify-flags/known-flags.txt | 1 + .../{flannel_server.go => flannel_helper.go} | 16 ++ pkg/kubelet/kubelet.go | 18 +- 12 files changed, 177 insertions(+), 73 deletions(-) delete mode 100644 cluster/saltbase/salt/flannel/network.json rename pkg/kubelet/{flannel_server.go => flannel_helper.go} (88%) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index b6bcf8ba2725e..8fd2fa8422ff6 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -120,4 +120,3 @@ OPENCONTRAIL_PUBLIC_SUBNET="${OPENCONTRAIL_PUBLIC_SUBNET:-10.1.0.0/16}" # Optional: if set to true, kube-up will configure the cluster to run e2e tests. 
E2E_STORAGE_TEST_ENVIRONMENT=${KUBE_E2E_STORAGE_TEST_ENVIRONMENT:-false} -FIREWALL_ETCD="${FIREWALL_SSH:-${NETWORK}-allow-etcd}" diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 78b1cd69fd200..5eb4046ecc786 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -134,4 +134,3 @@ OPENCONTRAIL_PUBLIC_SUBNET="${OPENCONTRAIL_PUBLIC_SUBNET:-10.1.0.0/16}" E2E_STORAGE_TEST_ENVIRONMENT=${KUBE_E2E_STORAGE_TEST_ENVIRONMENT:-false} # Overlay network settings OVERLAY_NETWORK=${OVERLAY_NETWORK:-true} -FIREWALL_ETCD="${FIREWALL_SSH:-${NETWORK}-allow-etcd}" diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 9e45f27fad54b..655221c9d165c 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -304,7 +304,7 @@ function create-static-ip { echo -e "${color_red}Failed to create static ip $1 ${color_norm}" >&2 exit 2 fi - attempt=$(($attempt+1)) + attempt=$(($attempt+1)) echo -e "${color_yellow}Attempt $attempt failed to create static ip $1. Retrying.${color_norm}" >&2 sleep $(($attempt * 5)) else @@ -603,28 +603,13 @@ function kube-up { --allow "tcp:22" & fi + echo "Starting master and configuring firewalls" gcloud compute firewall-rules create "${MASTER_NAME}-https" \ --project "${PROJECT}" \ --network "${NETWORK}" \ --target-tags "${MASTER_TAG}" \ --allow tcp:443 & - if [[ "${OVERLAY_NETWORK}" == "true" ]]; then - # TODO: Where to put this? Scope it to flannel setup. - if ! "${GCLOUD}" compute firewall-rules --project "${PROJECT}" describe "${FIREWALL_ETCD}" &>/dev/null; then - "${GCLOUD}" compute firewall-rules create "${FIREWALL_ETCD}" \ - --network="${NETWORK}" \ - --project="${PROJECT}" \ - --source-ranges="10.0.0.0/8" \ - --target-tags "${MINION_TAG}" \ - --allow tcp:4001 & - else - echo "... 
Using etcd firewall-rule: ${FIREWALL_ETCD}" >&2 - fi - else - echo "Not opening etcd up to the cluster: ${OVERLAY_NETWORK} ${FIREWALL_ETCD}" - fi - # We have to make sure the disk is created before creating the master VM, so # run this in the foreground. gcloud compute disks create "${MASTER_NAME}-pd" \ @@ -687,7 +672,7 @@ function kube-up { write-node-env local template_name="${NODE_INSTANCE_PREFIX}-template" - + create-node-instance-template $template_name gcloud compute instance-groups managed \ diff --git a/cluster/saltbase/salt/flannel-server/init.sls b/cluster/saltbase/salt/flannel-server/init.sls index 154d943adb852..a5b1d2e66c72f 100644 --- a/cluster/saltbase/salt/flannel-server/init.sls +++ b/cluster/saltbase/salt/flannel-server/init.sls @@ -8,7 +8,7 @@ touch /var/log/etcd_flannel.log: /etc/kubernetes/network.json: file.managed: - - source: salt://flannel/network.json + - source: salt://flannel-server/network.json - makedirs: True - user: root - group: root diff --git a/cluster/saltbase/salt/flannel/network.json b/cluster/saltbase/salt/flannel/network.json deleted file mode 100644 index c8d8e0788b8ba..0000000000000 --- a/cluster/saltbase/salt/flannel/network.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "Network": "18.16.0.0/16", - "SubnetLen": 24, - "Backend": { - "Type": "vxlan", - "VNI": 1 - } -} diff --git a/cluster/saltbase/salt/top.sls b/cluster/saltbase/salt/top.sls index 5c40a36e44a7e..45294498427b3 100644 --- a/cluster/saltbase/salt/top.sls +++ b/cluster/saltbase/salt/top.sls @@ -14,9 +14,6 @@ base: - match: grain - docker - flannel -{% if grains['cloud'] is defined and grains['cloud'] == 'azure' %} - - openvpn-client -{% endif %} - helpers - cadvisor - kube-client-tools diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 2bc3e34cf6cf2..63bc35546d72b 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -159,10 +159,7 @@ type KubeletServer struct { // Pull images one at a time. 
SerializeImagePulls bool - - // Flannel config parameters - UseDefaultOverlay bool - NetworkConfig string + UseDefaultOverlay bool } // bootstrapping interface for kubelet, targets the initialization protocol @@ -237,7 +234,6 @@ func NewKubeletServer() *KubeletServer { KubeAPIBurst: 10, // Flannel parameters UseDefaultOverlay: useDefaultOverlay, - // NetworkConfig: networkConfig, } } @@ -355,7 +351,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { // Flannel config parameters fs.BoolVar(&s.UseDefaultOverlay, "use-default-overlay", s.UseDefaultOverlay, "Experimental support for starting the kubelet with the default overlay network (flannel). Assumes flanneld is already running in client mode. [default=false]") - fs.StringVar(&s.NetworkConfig, "network-config", s.NetworkConfig, "Absolute path to a network json file, as accepted by flannel.") } // UnsecuredKubeletConfig returns a KubeletConfig suitable for being run, or an error if the server setup @@ -494,9 +489,7 @@ func (s *KubeletServer) UnsecuredKubeletConfig() (*KubeletConfig, error) { Writer: writer, VolumePlugins: ProbeVolumePlugins(), - // Flannel options UseDefaultOverlay: s.UseDefaultOverlay, - NetworkConfig: s.NetworkConfig, }, nil } @@ -969,9 +962,7 @@ type KubeletConfig struct { Writer io.Writer VolumePlugins []volume.VolumePlugin - // Flannel parameters UseDefaultOverlay bool - NetworkConfig string } func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.PodConfig, err error) { @@ -1056,7 +1047,6 @@ func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod kc.ContainerManager, // Flannel parameters kc.UseDefaultOverlay, - //kc.NetworkConfig, ) if err != nil { diff --git a/docs/admin/kubelet.md b/docs/admin/kubelet.md index 44946ad848b12..00f32a89985a3 100644 --- a/docs/admin/kubelet.md +++ b/docs/admin/kubelet.md @@ -137,9 +137,10 @@ kubelet --system-container="": Optional resource-only container in which to place all non-kernel processes that are 
not already in a container. Empty for no container. Rolling back the flag requires a reboot. (Default: ""). --tls-cert-file="": File containing x509 Certificate for HTTPS. (CA cert, if any, concatenated after server cert). If --tls-cert-file and --tls-private-key-file are not provided, a self-signed certificate and key are generated for the public address and saved to the directory passed to --cert-dir. --tls-private-key-file="": File containing x509 private key matching --tls-cert-file. + --use-default-overlay[=true]: Experimental support for starting the kubelet with the default overlay network (flannel). Assumes flanneld is already running in client mode. [default=false] ``` -###### Auto generated by spf13/cobra on 21-Nov-2015 +###### Auto generated by spf13/cobra on 23-Nov-2015 diff --git a/docs/proposals/flannel-integration.md b/docs/proposals/flannel-integration.md index 5f33ec3076d7a..417cab1d36420 100644 --- a/docs/proposals/flannel-integration.md +++ b/docs/proposals/flannel-integration.md @@ -1,35 +1,165 @@ + + + + +WARNING +WARNING +WARNING +WARNING +WARNING + +

PLEASE NOTE: This document applies to the HEAD of the source tree

+ +If you are using a released version of Kubernetes, you should +refer to the docs that go with that version. + + +The latest release of this document can be found +[here](http://releases.k8s.io/release-1.1/docs/proposals/flannel-integration.md). + +Documentation for other releases can be found at +[releases.k8s.io](http://releases.k8s.io). + +-- + + + + + # Flannel integration with Kubernetes ## Why? * Networking works out of the box. -* Cloud gateway configuration is regulated. +* Cloud gateway configuration is regulated by quota. * Consistent bare metal and cloud experience. * Lays foundation for integrating with networking backends and vendors. -# How? +## How? + +Thus: + +``` +Master | Node1 +---------------------------------------------------------------------- +{192.168.0.0/16, 256 /24} | docker + | | | restart with podcidr +apiserver <------------------ kubelet (sends podcidr) + | | | here's podcidr, mtu +flannel-server:10253 <------------------ flannel-daemon +Allocates a /24 ------------------> [config iptables, VXLan] + <------------------ [watch subnet leases] +I just allocated ------------------> [config VXLan] +another /24 | +``` + +## Proposal + +Explaining vxlan is out of the scope of this document, however it does take some basic understanding to grok the proposal. Assume some pod wants to communicate across nodes with the above setup. 
Check the flannel vxlan devices: + +```console +node1 $ ip -d link show flannel.1 +4: flannel.1: mtu 1410 qdisc noqueue state UNKNOWN mode DEFAULT + link/ether a2:53:86:b5:5f:c1 brd ff:ff:ff:ff:ff:ff + vxlan +node1 $ ip -d link show eth0 +2: eth0: mtu 1460 qdisc mq state UP mode DEFAULT qlen 1000 + link/ether 42:01:0a:f0:00:04 brd ff:ff:ff:ff:ff:ff + +node2 $ ip -d link show flannel.1 +4: flannel.1: mtu 1410 qdisc noqueue state UNKNOWN mode DEFAULT + link/ether 56:71:35:66:4a:d8 brd ff:ff:ff:ff:ff:ff + vxlan +node2 $ ip -d link show eth0 +2: eth0: mtu 1460 qdisc mq state UP mode DEFAULT qlen 1000 + link/ether 42:01:0a:f0:00:03 brd ff:ff:ff:ff:ff:ff +``` + +Note that we're ignoring cbr0 for the sake of simplicity. Spin-up a container on each node. We're using raw docker for this example only because we want control over where the container lands: ``` -Master Node1 ----------------------|-------------------------------- -database | - | | -{10.250.0.0/16} | docker - | here's podcidr |restart with podcidr -apiserver <------------------- kubelet - | | |here's podcidr -flannel-server:10253 <------- flannel-daemon - --/16---> - <--watch-- [config iptables] - subscribe to new node subnets - --------> [config VXLan] - | +node1 $ docker run -it radial/busyboxplus:curl /bin/sh +[ root@5ca3c154cde3:/ ]$ ip addr show +1: lo: mtu 65536 qdisc noqueue +8: eth0: mtu 1410 qdisc noqueue + link/ether 02:42:12:10:20:03 brd ff:ff:ff:ff:ff:ff + inet 192.168.32.3/24 scope global eth0 + valid_lft forever preferred_lft forever + +node2 $ docker run -it radial/busyboxplus:curl /bin/sh +[ root@d8a879a29f5d:/ ]$ ip addr show +1: lo: mtu 65536 qdisc noqueue +16: eth0: mtu 1410 qdisc noqueue + link/ether 02:42:12:10:0e:07 brd ff:ff:ff:ff:ff:ff + inet 192.168.14.7/24 scope global eth0 + valid_lft forever preferred_lft forever +[ root@d8a879a29f5d:/ ]$ ping 192.168.32.3 +PING 192.168.32.3 (192.168.32.3): 56 data bytes +64 bytes from 192.168.32.3: seq=0 ttl=62 time=1.190 ms ``` -There is a tiny lie 
in the above diagram, as of now, the flannel server on the master maintains a private etcd. This will not be necessary once we have a generalized network resource, and a Kubernetes x flannel backend. +__What happened?__: + +From 1000 feet: +* vxlan device driver starts up on node1 and creates a udp tunnel endpoint on 8472 +* container 192.168.32.3 pings 192.168.14.7 + - what's the MAC of 192.168.14.0? + - L2 miss, flannel looks up MAC of subnet + - Stores `192.168.14.0 <-> 56:71:35:66:4a:d8` in neighbor table + - what's tunnel endpoint of this MAC? + - L3 miss, flannel looks up destination VM ip + - Stores `10.240.0.3 <-> 56:71:35:66:4a:d8` in bridge database +* Sends `[56:71:35:66:4a:d8, 10.240.0.3][vxlan: port, vni][02:42:12:10:20:03, 192.168.14.7][icmp]` + +__But will it blend?__ + +Kubernetes integration is fairly straight-forward once we understand the pieces involved, and can be prioritized as follows: +* Kubelet understands flannel daemon in client mode, flannel server manages independent etcd store on master, node controller backs off cidr allocation +* Flannel server consults the Kubernetes master for everything network related +* Flannel daemon works through network plugins in a generic way without bothering the kubelet: needs CNI x Kubernetes standardization + +The first is accomplished in this PR, while a timeline for 2. and 3. is TBD. 
To implement the flannel api we can either run a proxy per node and get rid of the flannel server, or service all requests in the flannel server with something like a go-routine per node: +* `/network/config`: read network configuration and return +* `/network/leases`: + - Post: Return a lease as understood by flannel + - Lookup node by IP + - Store node metadata from [flannel request](https://github.com/coreos/flannel/blob/master/subnet/subnet.go#L34) in annotations + - Return [Lease object](https://github.com/coreos/flannel/blob/master/subnet/subnet.go#L40) reflecting node cidr + - Get: Handle a watch on leases +* `/network/leases/subnet`: + - Put: This is a request for a lease. If the nodecontroller is allocating CIDRs we can probably just no-op. +* `/network/reservations`: TBD, we can probably use this to accommodate node controller allocating CIDR instead of flannel requesting it + +The ick-iest part of this implementation is going to be the `GET /network/leases`, i.e. the watch proxy. We can side-step by waiting for a more generic Kubernetes resource. However, we can also implement it as follows: +* Watch all nodes, ignore heartbeats +* On each change, figure out the lease for the node, construct a [lease watch result](https://github.com/coreos/flannel/blob/0bf263826eab1707be5262703a8092c7d15e0be4/subnet/subnet.go#L72), and send it down the watch with the RV from the node +* Implement a lease list that does a similar translation + +I say this is gross without an api object because for each node->lease translation one has to store and retrieve the node metadata sent by flannel (e.g. VTEP) from node annotations. [Reference implementation](https://github.com/bprashanth/kubernetes/blob/network_vxlan/pkg/kubelet/flannel_server.go) and [watch proxy](https://github.com/bprashanth/kubernetes/blob/network_vxlan/pkg/kubelet/watch_proxy.go). 
# Limitations * Integration is experimental +* Flannel etcd not stored in persistent disk +* CIDR allocation does *not* flow from Kubernetes down to nodes anymore # Wishlist + +This proposal is really just a call for community help in writing a Kubernetes x flannel backend. + +* CNI plugin integration +* Flannel daemon in privileged pod +* Flannel server talks to apiserver, described in proposal above +* HTTPs between flannel daemon/server +* Investigate flannel server running on every node (as done in the reference implementation mentioned above) +* Use flannel reservation mode to support node controller podcidr allocation + + + +[![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/docs/proposals/flannel-integration.md?pixel)]() + diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 2bcb625923a7f..2472f4bc1aa02 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -327,3 +327,4 @@ watch-only whitelist-override-label windows-line-endings www-prefix +use-default-overlay diff --git a/pkg/kubelet/flannel_server.go b/pkg/kubelet/flannel_helper.go similarity index 88% rename from pkg/kubelet/flannel_server.go rename to pkg/kubelet/flannel_helper.go index d485a84c7e2d7..91a16e8c36936 100644 --- a/pkg/kubelet/flannel_server.go +++ b/pkg/kubelet/flannel_helper.go @@ -1,3 +1,19 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package kubelet import ( diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 349103a13f8c2..8e45b69c15971 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -328,11 +328,8 @@ func NewMainKubelet( cpuCFSQuota: cpuCFSQuota, daemonEndpoints: daemonEndpoints, containerManager: containerManager, - - // Flannel options - // TODO: This is currently a dummy server. - flannelHelper: NewFlannelHelper(), - useDefaultOverlay: useDefaultOverlay, + flannelHelper: NewFlannelHelper(), + useDefaultOverlay: useDefaultOverlay, } if klet.kubeClient == nil { glog.Infof("Master not setting up flannel overlay") @@ -659,7 +656,6 @@ type Kubelet struct { // oneTimeInitializer is used to initialize modules that are dependent on the runtime to be up. oneTimeInitializer sync.Once - // Flannel options. useDefaultOverlay bool flannelHelper *FlannelHelper } @@ -1129,7 +1125,6 @@ func (kl *Kubelet) syncNodeStatus() { } if kl.registerNode { // This will exit immediately if it doesn't need to do anything. - glog.Infof("(kubelet) registering node with apiserver") kl.registerWithApiserver() } if err := kl.updateNodeStatus(); err != nil { @@ -2588,10 +2583,10 @@ func (kl *Kubelet) updateRuntimeUp() { func (kl *Kubelet) reconcileCBR0(podCIDR string) error { if podCIDR == "" { - glog.V(1).Info("(kubelet) PodCIDR not set. Will not configure cbr0.") + glog.V(5).Info("PodCIDR not set. 
Will not configure cbr0.") return nil } - glog.V(1).Infof("(kubelet) PodCIDR is set to %q", podCIDR) + glog.V(5).Infof("PodCIDR is set to %q", podCIDR) _, cidr, err := net.ParseCIDR(podCIDR) if err != nil { return err @@ -2634,13 +2629,12 @@ func (kl *Kubelet) syncNetworkStatus() { var err error if kl.configureCBR0 { if kl.useDefaultOverlay { - glog.Infof("(kubelet) handshaking") podCIDR, err := kl.flannelHelper.Handshake() if err != nil { glog.Infof("Flannel server handshake failed %v", err) return } - glog.Infof("(kubelet) setting cidr, currently: %v -> %v", + glog.Infof("Setting cidr: %v -> %v", kl.runtimeState.podCIDR(), podCIDR) kl.runtimeState.setPodCIDR(podCIDR) } @@ -2915,7 +2909,7 @@ func (kl *Kubelet) tryUpdateNodeStatus() error { } else if kl.reconcileCIDR { kl.runtimeState.setPodCIDR(node.Spec.PodCIDR) } - glog.Infof("(kubelet) updating node in apiserver with cidr %v", node.Spec.PodCIDR) + glog.Infof("Updating node in apiserver with cidr %v", node.Spec.PodCIDR) if err := kl.setNodeStatus(node); err != nil { return err