|
|
- // zk helper functions
- // modified from Vitess project
-
- package zkhelper
-
- import (
- "encoding/json"
- "errors"
- "fmt"
- "math/rand"
- "os"
- "path"
- "sort"
- "strings"
- "sync"
- "time"
-
- "github.com/ngaut/go-zookeeper/zk"
- "github.com/ngaut/log"
- )
-
// ErrInterrupted is returned by functions that wait for a result when
// the wait is interrupted before the result is available.
var ErrInterrupted = errors.New("zkutil: obtaining lock was interrupted")

// ErrTimeout is returned by functions that wait for a result when the
// timeout value is reached first.
var ErrTimeout = errors.New("zkutil: obtaining lock timed out")
-
- const (
- // PERM_DIRECTORY are default permissions for a node.
- PERM_DIRECTORY = zk.PermAdmin | zk.PermCreate | zk.PermDelete | zk.PermRead | zk.PermWrite
- // PERM_FILE allows a zk node to emulate file behavior by disallowing child nodes.
- PERM_FILE = zk.PermAdmin | zk.PermRead | zk.PermWrite
- MagicPrefix = "zk"
- )
-
// init seeds the global math/rand source from the current wall-clock
// time in nanoseconds. CreatePidNode draws its retry jitter from that
// source.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
-
- type MyZkConn struct {
- *zk.Conn
- }
-
- func (conn *MyZkConn) Seq2Str(seq int64) string {
- return fmt.Sprintf("%0.10d", seq)
- }
-
- func ConnectToZk(zkAddr string) (Conn, error) {
- zkConn, _, err := zk.Connect(strings.Split(zkAddr, ","), 3*time.Second)
- if err != nil {
- return nil, err
- }
-
- return &MyZkConn{Conn: zkConn}, nil
- }
-
- func ConnectToZkWithTimeout(zkAddr string, recvTime time.Duration) (Conn, error) {
- zkConn, _, err := zk.Connect(strings.Split(zkAddr, ","), recvTime)
- if err != nil {
- return nil, err
- }
-
- return &MyZkConn{Conn: zkConn}, nil
- }
-
- func DefaultACLs() []zk.ACL {
- return zk.WorldACL(zk.PermAll)
- }
-
- func DefaultDirACLs() []zk.ACL {
- return zk.WorldACL(PERM_DIRECTORY)
- }
-
- func DefaultFileACLs() []zk.ACL {
- return zk.WorldACL(PERM_FILE)
- }
-
- // IsDirectory returns if this node should be treated as a directory.
- func IsDirectory(aclv []zk.ACL) bool {
- for _, acl := range aclv {
- if acl.Perms != PERM_DIRECTORY {
- return false
- }
- }
- return true
- }
-
// ZkErrorEqual compares two errors by message rather than by identity.
// Two non-nil errors are equal when their Error() strings match; when
// either side is nil the result is the plain a == b comparison, so the
// function returns true only if both are nil.
func ZkErrorEqual(a, b error) bool {
	if a == nil || b == nil {
		return a == b
	}
	return a.Error() == b.Error()
}
-
- // Create a path and any pieces required, think mkdir -p.
- // Intermediate znodes are always created empty.
- func CreateRecursive(zconn Conn, zkPath, value string, flags int, aclv []zk.ACL) (pathCreated string, err error) {
- parts := strings.Split(zkPath, "/")
- if parts[1] != MagicPrefix {
- return "", fmt.Errorf("zkutil: non /%v path: %v", MagicPrefix, zkPath)
- }
-
- pathCreated, err = zconn.Create(zkPath, []byte(value), int32(flags), aclv)
- if ZkErrorEqual(err, zk.ErrNoNode) {
- // Make sure that nodes are either "file" or "directory" to mirror file system
- // semantics.
- dirAclv := make([]zk.ACL, len(aclv))
- for i, acl := range aclv {
- dirAclv[i] = acl
- dirAclv[i].Perms = PERM_DIRECTORY
- }
- _, err = CreateRecursive(zconn, path.Dir(zkPath), "", flags, dirAclv)
- if err != nil && !ZkErrorEqual(err, zk.ErrNodeExists) {
- return "", err
- }
- pathCreated, err = zconn.Create(zkPath, []byte(value), int32(flags), aclv)
- }
- return
- }
-
- func CreateOrUpdate(zconn Conn, zkPath, value string, flags int, aclv []zk.ACL, recursive bool) (pathCreated string, err error) {
- if recursive {
- pathCreated, err = CreateRecursive(zconn, zkPath, value, 0, aclv)
- } else {
- pathCreated, err = zconn.Create(zkPath, []byte(value), 0, aclv)
- }
- if err != nil && ZkErrorEqual(err, zk.ErrNodeExists) {
- pathCreated = ""
- _, err = zconn.Set(zkPath, []byte(value), -1)
- }
- return
- }
-
// pathItem pairs a path with an error value.
type pathItem struct {
	path string
	err  error
}
-
- func ChildrenRecursive(zconn Conn, zkPath string) ([]string, error) {
- var err error
- mutex := sync.Mutex{}
- wg := sync.WaitGroup{}
- pathList := make([]string, 0, 32)
- children, _, err := zconn.Children(zkPath)
- if err != nil {
- return nil, err
- }
-
- for _, child := range children {
- wg.Add(1)
- go func(child string) {
- childPath := path.Join(zkPath, child)
- rChildren, zkErr := ChildrenRecursive(zconn, childPath)
- if zkErr != nil {
- // If other processes are deleting nodes, we need to ignore
- // the missing nodes.
- if !ZkErrorEqual(zkErr, zk.ErrNoNode) {
- mutex.Lock()
- err = zkErr
- mutex.Unlock()
- }
- } else {
- mutex.Lock()
- pathList = append(pathList, child)
- for _, rChild := range rChildren {
- pathList = append(pathList, path.Join(child, rChild))
- }
- mutex.Unlock()
- }
- wg.Done()
- }(child)
- }
-
- wg.Wait()
-
- mutex.Lock()
- defer mutex.Unlock()
- if err != nil {
- return nil, err
- }
- return pathList, nil
- }
-
// HasWildcard reports whether path contains an unescaped '*', '?' or
// '['. A backslash escapes the byte that follows it; a backslash that
// is the last byte of path is itself reported as a wildcard.
func HasWildcard(path string) bool {
	n := len(path)
	for i := 0; i < n; i++ {
		c := path[i]
		if c == '*' || c == '?' || c == '[' {
			return true
		}
		if c != '\\' {
			continue
		}
		// Trailing backslash: nothing left to escape.
		if i == n-1 {
			return true
		}
		// Skip the escaped byte.
		i++
	}
	return false
}
-
- func resolveRecursive(zconn Conn, parts []string, toplevel bool) ([]string, error) {
- for i, part := range parts {
- if HasWildcard(part) {
- var children []string
- zkParentPath := strings.Join(parts[:i], "/")
- var err error
- children, _, err = zconn.Children(zkParentPath)
- if err != nil {
- // we asked for something like
- // /zk/cell/aaa/* and
- // /zk/cell/aaa doesn't exist
- // -> return empty list, no error
- // (note we check both a regular zk
- // error and the error the test
- // produces)
- if ZkErrorEqual(err, zk.ErrNoNode) {
- return nil, nil
- }
- // otherwise we return the error
- return nil, err
- }
- sort.Strings(children)
-
- results := make([][]string, len(children))
- wg := &sync.WaitGroup{}
- mu := &sync.Mutex{}
- var firstError error
-
- for j, child := range children {
- matched, err := path.Match(part, child)
- if err != nil {
- return nil, err
- }
- if matched {
- // we have a match!
- wg.Add(1)
- newParts := make([]string, len(parts))
- copy(newParts, parts)
- newParts[i] = child
- go func(j int) {
- defer wg.Done()
- subResult, err := resolveRecursive(zconn, newParts, false)
- if err != nil {
- mu.Lock()
- if firstError != nil {
- log.Infof("Multiple error: %v", err)
- } else {
- firstError = err
- }
- mu.Unlock()
- } else {
- results[j] = subResult
- }
- }(j)
- }
- }
-
- wg.Wait()
- if firstError != nil {
- return nil, firstError
- }
-
- result := make([]string, 0, 32)
- for j := 0; j < len(children); j++ {
- subResult := results[j]
- if subResult != nil {
- result = append(result, subResult...)
- }
- }
-
- // we found a part that is a wildcard, we
- // added the children already, we're done
- return result, nil
- }
- }
-
- // no part contains a wildcard, add the path if it exists, and done
- path := strings.Join(parts, "/")
- if toplevel {
- // for whatever the user typed at the toplevel, we don't
- // check it exists or not, we just return it
- return []string{path}, nil
- }
-
- // this is an expanded path, we need to check if it exists
- _, stat, err := zconn.Exists(path)
- if err != nil {
- return nil, err
- }
- if stat != nil {
- return []string{path}, nil
- }
- return nil, nil
- }
-
- // resolve paths like:
- // /zk/nyc/vt/tablets/*/action
- // /zk/global/vt/keyspaces/*/shards/*/action
- // /zk/*/vt/tablets/*/action
- // into real existing paths
- //
- // If you send paths that don't contain any wildcard and
- // don't exist, this function will return an empty array.
- func ResolveWildcards(zconn Conn, zkPaths []string) ([]string, error) {
- // check all the paths start with /zk/ before doing anything
- // time consuming
- // relax this in case we are not talking to a metaconn and
- // just want to talk to a specified instance.
- // for _, zkPath := range zkPaths {
- // if _, err := ZkCellFromZkPath(zkPath); err != nil {
- // return nil, err
- // }
- // }
-
- results := make([][]string, len(zkPaths))
- wg := &sync.WaitGroup{}
- mu := &sync.Mutex{}
- var firstError error
-
- for i, zkPath := range zkPaths {
- wg.Add(1)
- parts := strings.Split(zkPath, "/")
- go func(i int) {
- defer wg.Done()
- subResult, err := resolveRecursive(zconn, parts, true)
- if err != nil {
- mu.Lock()
- if firstError != nil {
- log.Infof("Multiple error: %v", err)
- } else {
- firstError = err
- }
- mu.Unlock()
- } else {
- results[i] = subResult
- }
- }(i)
- }
-
- wg.Wait()
- if firstError != nil {
- return nil, firstError
- }
-
- result := make([]string, 0, 32)
- for i := 0; i < len(zkPaths); i++ {
- subResult := results[i]
- if subResult != nil {
- result = append(result, subResult...)
- }
- }
-
- return result, nil
- }
-
- func DeleteRecursive(zconn Conn, zkPath string, version int) error {
- // version: -1 delete any version of the node at path - only applies to the top node
- err := zconn.Delete(zkPath, int32(version))
- if err == nil {
- return nil
- }
- if !ZkErrorEqual(err, zk.ErrNotEmpty) {
- return err
- }
- // Remove the ability for other nodes to get created while we are trying to delete.
- // Otherwise, you can enter a race condition, or get starved out from deleting.
- _, err = zconn.SetACL(zkPath, zk.WorldACL(zk.PermAdmin|zk.PermDelete|zk.PermRead), int32(version))
- if err != nil {
- return err
- }
- children, _, err := zconn.Children(zkPath)
- if err != nil {
- return err
- }
- for _, child := range children {
- err := DeleteRecursive(zconn, path.Join(zkPath, child), -1)
- if err != nil && !ZkErrorEqual(err, zk.ErrNoNode) {
- return fmt.Errorf("zkutil: recursive delete failed: %v", err)
- }
- }
-
- err = zconn.Delete(zkPath, int32(version))
- if err != nil && !ZkErrorEqual(err, zk.ErrNotEmpty) {
- err = fmt.Errorf("zkutil: nodes getting recreated underneath delete (app race condition): %v", zkPath)
- }
- return err
- }
-
- // The lexically lowest node is the lock holder - verify that this
- // path holds the lock. Call this queue-lock because the semantics are
- // a hybrid. Normal zk locks make assumptions about sequential
- // numbering that don't hold when the data in a lock is modified.
- // if the provided 'interrupted' chan is closed, we'll just stop waiting
- // and return an interruption error
- func ObtainQueueLock(zconn Conn, zkPath string, wait time.Duration, interrupted chan struct{}) error {
- queueNode := path.Dir(zkPath)
- lockNode := path.Base(zkPath)
-
- timer := time.NewTimer(wait)
- trylock:
- children, _, err := zconn.Children(queueNode)
- if err != nil {
- return fmt.Errorf("zkutil: trylock failed %v", err)
- }
- sort.Strings(children)
- if len(children) > 0 {
- if children[0] == lockNode {
- return nil
- }
- if wait > 0 {
- prevLock := ""
- for i := 1; i < len(children); i++ {
- if children[i] == lockNode {
- prevLock = children[i-1]
- break
- }
- }
- if prevLock == "" {
- return fmt.Errorf("zkutil: no previous queue node found: %v", zkPath)
- }
-
- zkPrevLock := path.Join(queueNode, prevLock)
- _, stat, watch, err := zconn.ExistsW(zkPrevLock)
- if err != nil {
- return fmt.Errorf("zkutil: unable to watch queued node %v %v", zkPrevLock, err)
- }
- if stat == nil {
- goto trylock
- }
- select {
- case <-timer.C:
- break
- case <-interrupted:
- return ErrInterrupted
- case <-watch:
- // The precise event doesn't matter - try to read again regardless.
- goto trylock
- }
- }
- return ErrTimeout
- }
- return fmt.Errorf("zkutil: empty queue node: %v", queueNode)
- }
-
- func ZkEventOk(e zk.Event) bool {
- return e.State == zk.StateConnected
- }
-
- func NodeExists(zconn Conn, zkPath string) (bool, error) {
- b, _, err := zconn.Exists(zkPath)
- return b, err
- }
-
- // Close the release channel when you want to clean up nicely.
- func CreatePidNode(zconn Conn, zkPath string, contents string, done chan struct{}) error {
- // On the first try, assume the cluster is up and running, that will
- // help hunt down any config issues present at startup
- if _, err := zconn.Create(zkPath, []byte(contents), zk.FlagEphemeral, zk.WorldACL(PERM_FILE)); err != nil {
- if ZkErrorEqual(err, zk.ErrNodeExists) {
- err = zconn.Delete(zkPath, -1)
- }
- if err != nil {
- return fmt.Errorf("zkutil: failed deleting pid node: %v: %v", zkPath, err)
- }
- _, err = zconn.Create(zkPath, []byte(contents), zk.FlagEphemeral, zk.WorldACL(PERM_FILE))
- if err != nil {
- return fmt.Errorf("zkutil: failed creating pid node: %v: %v", zkPath, err)
- }
- }
-
- go func() {
- for {
- _, _, watch, err := zconn.GetW(zkPath)
- if err != nil {
- if ZkErrorEqual(err, zk.ErrNoNode) {
- _, err = zconn.Create(zkPath, []byte(contents), zk.FlagEphemeral, zk.WorldACL(zk.PermAll))
- if err != nil {
- log.Warningf("failed recreating pid node: %v: %v", zkPath, err)
- } else {
- log.Infof("recreated pid node: %v", zkPath)
- continue
- }
- } else {
- log.Warningf("failed reading pid node: %v", err)
- }
- } else {
- select {
- case event := <-watch:
- if ZkEventOk(event) && event.Type == zk.EventNodeDeleted {
- // Most likely another process has started up. However,
- // there is a chance that an ephemeral node is deleted by
- // the session expiring, yet that same session gets a watch
- // notification. This seems like buggy behavior, but rather
- // than race too hard on the node, just wait a bit and see
- // if the situation resolves itself.
- log.Warningf("pid deleted: %v", zkPath)
- } else {
- log.Infof("pid node event: %v", event)
- }
- // break here and wait for a bit before attempting
- case <-done:
- log.Infof("pid watcher stopped on done: %v", zkPath)
- return
- }
- }
- select {
- // No one likes a thundering herd, least of all zk.
- case <-time.After(5*time.Second + time.Duration(rand.Int63n(55e9))):
- case <-done:
- log.Infof("pid watcher stopped on done: %v", zkPath)
- return
- }
- }
- }()
-
- return nil
- }
-
// ZLocker is an interface for a lock that can fail.
type ZLocker interface {
	// Lock acquires the lock; desc describes the holder.
	Lock(desc string) error
	// LockWithTimeout is Lock bounded by the wait duration.
	LockWithTimeout(wait time.Duration, desc string) error
	// Unlock releases the lock.
	Unlock() error
	// Interrupt asks a pending acquisition to give up.
	Interrupt()
}
-
- // Experiment with a little bit of abstraction.
- // FIMXE(msolo) This object may need a mutex to ensure it can be shared
- // across goroutines.
- type zMutex struct {
- mu sync.Mutex
- zconn Conn
- path string // Path under which we try to create lock nodes.
- contents string
- interrupted chan struct{}
- name string // The name of the specific lock node we created.
- ephemeral bool
- }
-
- // CreateMutex initializes an unaquired mutex. A mutex is released only
- // by Unlock. You can clean up a mutex with delete, but you should be
- // careful doing so.
- func CreateMutex(zconn Conn, zkPath string) ZLocker {
- zm, err := CreateMutexWithContents(zconn, zkPath, map[string]interface{}{})
- if err != nil {
- panic(err) // should never happen
- }
- return zm
- }
-
- // CreateMutex initializes an unaquired mutex with special content for this mutex.
- // A mutex is released only by Unlock. You can clean up a mutex with delete, but you should be
- // careful doing so.
- func CreateMutexWithContents(zconn Conn, zkPath string, contents map[string]interface{}) (ZLocker, error) {
- hostname, err := os.Hostname()
- if err != nil {
- return nil, err
- }
- pid := os.Getpid()
- contents["hostname"] = hostname
- contents["pid"] = pid
-
- data, err := json.Marshal(contents)
- if err != nil {
- return nil, err
- }
-
- return &zMutex{zconn: zconn, path: zkPath, contents: string(data), interrupted: make(chan struct{})}, nil
- }
-
- // Interrupt releases a lock that's held.
- func (zm *zMutex) Interrupt() {
- select {
- case zm.interrupted <- struct{}{}:
- default:
- log.Warningf("zmutex interrupt blocked")
- }
- }
-
- // Lock returns nil when the lock is acquired.
- func (zm *zMutex) Lock(desc string) error {
- return zm.LockWithTimeout(365*24*time.Hour, desc)
- }
-
- // LockWithTimeout returns nil when the lock is acquired. A lock is
- // held if the file exists and you are the creator. Setting the wait
- // to zero makes this a nonblocking lock check.
- //
- // FIXME(msolo) Disallow non-super users from removing the lock?
- func (zm *zMutex) LockWithTimeout(wait time.Duration, desc string) (err error) {
- timer := time.NewTimer(wait)
- defer func() {
- if panicErr := recover(); panicErr != nil || err != nil {
- zm.deleteLock()
- }
- }()
- // Ensure the rendezvous node is here.
- // FIXME(msolo) Assuming locks are contended, it will be cheaper to assume this just
- // exists.
- _, err = CreateRecursive(zm.zconn, zm.path, "", 0, zk.WorldACL(PERM_DIRECTORY))
- if err != nil && !ZkErrorEqual(err, zk.ErrNodeExists) {
- return err
- }
-
- lockPrefix := path.Join(zm.path, "lock-")
- zflags := zk.FlagSequence
- if zm.ephemeral {
- zflags = zflags | zk.FlagEphemeral
- }
-
- // update node content
- var lockContent map[string]interface{}
- err = json.Unmarshal([]byte(zm.contents), &lockContent)
- if err != nil {
- return err
- }
- lockContent["desc"] = desc
- newContent, err := json.Marshal(lockContent)
- if err != nil {
- return err
- }
-
- createlock:
- lockCreated, err := zm.zconn.Create(lockPrefix, newContent, int32(zflags), zk.WorldACL(PERM_FILE))
- if err != nil {
- return err
- }
- name := path.Base(lockCreated)
- zm.mu.Lock()
- zm.name = name
- zm.mu.Unlock()
-
- trylock:
- children, _, err := zm.zconn.Children(zm.path)
- if err != nil {
- return fmt.Errorf("zkutil: trylock failed %v", err)
- }
- sort.Strings(children)
- if len(children) == 0 {
- return fmt.Errorf("zkutil: empty lock: %v", zm.path)
- }
-
- if children[0] == name {
- // We are the lock owner.
- return nil
- }
-
- // This is the degenerate case of a nonblocking lock check. It's not optimal, but
- // also probably not worth optimizing.
- if wait == 0 {
- return ErrTimeout
- }
- prevLock := ""
- for i := 1; i < len(children); i++ {
- if children[i] == name {
- prevLock = children[i-1]
- break
- }
- }
- if prevLock == "" {
- // This is an interesting case. The node disappeared
- // underneath us, probably due to a session loss. We can
- // recreate the lock node (with a new sequence number) and
- // keep trying.
- log.Warningf("zkutil: no lock node found: %v/%v", zm.path, zm.name)
- goto createlock
- }
-
- zkPrevLock := path.Join(zm.path, prevLock)
- exist, stat, watch, err := zm.zconn.ExistsW(zkPrevLock)
- if err != nil {
- // FIXME(msolo) Should this be a retry?
- return fmt.Errorf("zkutil: unable to watch previous lock node %v %v", zkPrevLock, err)
- }
- if stat == nil || !exist {
- goto trylock
- }
- select {
- case <-timer.C:
- return ErrTimeout
- case <-zm.interrupted:
- return ErrInterrupted
- case event := <-watch:
- log.Infof("zkutil: lock event: %v", event)
- // The precise event doesn't matter - try to read again regardless.
- goto trylock
- }
- panic("unexpected")
- }
-
- // Unlock returns nil if the lock was successfully
- // released. Otherwise, it is most likely a zk related error.
- func (zm *zMutex) Unlock() error {
- return zm.deleteLock()
- }
-
- func (zm *zMutex) deleteLock() error {
- zm.mu.Lock()
- zpath := path.Join(zm.path, zm.name)
- zm.mu.Unlock()
-
- err := zm.zconn.Delete(zpath, -1)
- if err != nil && !ZkErrorEqual(err, zk.ErrNoNode) {
- return err
- }
- return nil
- }
-
- // ZElector stores basic state for running an election.
- type ZElector struct {
- *zMutex
- path string
- leader string
- }
-
- func (ze *ZElector) isLeader() bool {
- return ze.leader == ze.name
- }
-
// electionEvent pairs an integer event code with an error.
type electionEvent struct {
	Event int
	Err   error
}
-
// backoffDelay produces a doubling sequence of delays that starts at
// min and is capped at max.
type backoffDelay struct {
	min   time.Duration // First delay and the value Reset restores.
	max   time.Duration // Upper bound for the stored next delay.
	delay time.Duration // Delay the next NextDelay call will return.
}

// newBackoffDelay returns a backoff whose first delay is min.
func newBackoffDelay(min, max time.Duration) *backoffDelay {
	return &backoffDelay{min: min, max: max, delay: min}
}

// NextDelay returns the current delay and doubles the stored one for
// the following call, clamping it to max.
func (bd *backoffDelay) NextDelay() time.Duration {
	current := bd.delay
	next := 2 * current
	if next > bd.max {
		next = bd.max
	}
	bd.delay = next
	return current
}

// Reset restarts the sequence at min.
func (bd *backoffDelay) Reset() {
	bd.delay = bd.min
}
-
// ElectorTask is the interface for a task that runs essentially
// forever or until something bad happens. If a task must be stopped,
// it should be handled promptly - no second notification will be
// sent.
type ElectorTask interface {
	// Run executes the task and returns when it ends.
	Run() error
	// Stop asks the running task to end.
	Stop()
	// Interrupted returns true if the task was interrupted, false if it
	// died of natural causes. An interrupted task indicates that the
	// election should stop.
	Interrupted() bool
}
-
- // CreateElection returns an initialized elector. An election is
- // really a cycle of events. You are flip-flopping between leader and
- // candidate. It's better to think of this as a stream of events that
- // one needs to react to.
- func CreateElection(zconn Conn, zkPath string) ZElector {
- zm, err := CreateElectionWithContents(zconn, zkPath, map[string]interface{}{})
- if err != nil {
- // should never happend
- panic(err)
- }
- return zm
- }
-
- // CreateElection returns an initialized elector with special contents. An election is
- // really a cycle of events. You are flip-flopping between leader and
- // candidate. It's better to think of this as a stream of events that
- // one needs to react to.
- func CreateElectionWithContents(zconn Conn, zkPath string, contents map[string]interface{}) (ZElector, error) {
- l, err := CreateMutexWithContents(zconn, path.Join(zkPath, "candidates"), contents)
- if err != nil {
- return ZElector{}, err
- }
- zm := l.(*zMutex)
- zm.ephemeral = true
- return ZElector{zMutex: zm, path: zkPath}, nil
- }
-
- // RunTask returns nil when the underlyingtask ends or the error it
- // generated.
- func (ze *ZElector) RunTask(task ElectorTask) error {
- delay := newBackoffDelay(100*time.Millisecond, 1*time.Minute)
- leaderPath := path.Join(ze.path, "leader")
- for {
- _, err := CreateRecursive(ze.zconn, leaderPath, "", 0, zk.WorldACL(PERM_FILE))
- if err == nil || ZkErrorEqual(err, zk.ErrNodeExists) {
- break
- }
- log.Warningf("election leader create failed: %v", err)
- time.Sleep(delay.NextDelay())
- }
-
- for {
- err := ze.Lock("RunTask")
- if err != nil {
- log.Warningf("election lock failed: %v", err)
- if err == ErrInterrupted {
- return ErrInterrupted
- }
- continue
- }
- // Confirm your win and deliver acceptance speech. This notifies
- // listeners who will have been watching the leader node for
- // changes.
- _, err = ze.zconn.Set(leaderPath, []byte(ze.contents), -1)
- if err != nil {
- log.Warningf("election promotion failed: %v", err)
- continue
- }
-
- log.Infof("election promote leader %v", leaderPath)
- taskErrChan := make(chan error)
- go func() {
- taskErrChan <- task.Run()
- }()
-
- watchLeader:
- // Watch the leader so we can get notified if something goes wrong.
- data, _, watch, err := ze.zconn.GetW(leaderPath)
- if err != nil {
- log.Warningf("election unable to watch leader node %v %v", leaderPath, err)
- // FIXME(msolo) Add delay
- goto watchLeader
- }
-
- if string(data) != ze.contents {
- log.Warningf("election unable to promote leader")
- task.Stop()
- // We won the election, but we didn't become the leader. How is that possible?
- // (see Bush v. Gore for some inspiration)
- // It means:
- // 1. Someone isn't playing by the election rules (a bad actor).
- // Hard to detect - let's assume we don't have this problem. :)
- // 2. We lost our connection somehow and the ephemeral lock was cleared,
- // allowing someone else to win the election.
- continue
- }
-
- // This is where we start our target process and watch for its failure.
- waitForEvent:
- select {
- case <-ze.interrupted:
- log.Warning("election interrupted - stop child process")
- task.Stop()
- // Once the process dies from the signal, this will all tear down.
- goto waitForEvent
- case taskErr := <-taskErrChan:
- // If our code fails, unlock to trigger an election.
- log.Infof("election child process ended: %v", taskErr)
- ze.Unlock()
- if task.Interrupted() {
- log.Warningf("election child process interrupted - stepping down")
- return ErrInterrupted
- }
- continue
- case zevent := <-watch:
- // We had a zk connection hiccup. We have a few choices,
- // but it depends on the constraints and the events.
- //
- // If we get SESSION_EXPIRED our connection loss triggered an
- // election that we won't have won and the thus the lock was
- // automatically freed. We have no choice but to start over.
- if zevent.State == zk.StateExpired {
- log.Warningf("election leader watch expired")
- task.Stop()
- continue
- }
-
- // Otherwise, we had an intermittent issue or something touched
- // the node. Either we lost our position or someone broke
- // protocol and touched the leader node. We just reconnect and
- // revalidate. In the meantime, assume we are still the leader
- // until we determine otherwise.
- //
- // On a reconnect we will be able to see the leader
- // information. If we still hold the position, great. If not, we
- // kill the associated process.
- //
- // On a leader node change, we need to perform the same
- // validation. It's possible an election completes without the
- // old leader realizing he is out of touch.
- log.Warningf("election leader watch event %v", zevent)
- goto watchLeader
- }
- }
- panic("unreachable")
- }
|