epoch_tracker.go (forked from influxdata/influxdb)

package tsdb

import (
	"sync"
)

// TODO(jeff): using a mutex is easiest, but there may be a way to do
// this with atomics only, and in a way such that writes are minimally
// blocked.

// epochTracker keeps track of epochs for write and delete operations
// allowing a delete to block until all previous writes have completed.
type epochTracker struct {
	mu      sync.Mutex
	epoch   uint64 // current epoch
	largest uint64 // largest delete possible
	writes  int64  // pending writes

	// pending deletes waiting on writes
	deletes map[uint64]*epochDeleteState
}

// newEpochTracker constructs an epochTracker.
func newEpochTracker() *epochTracker {
	return &epochTracker{
		deletes: make(map[uint64]*epochDeleteState),
	}
}

// epochDeleteState keeps track of the state for a pending delete.
type epochDeleteState struct {
	cond    *sync.Cond
	guard   *guard
	pending int64
}

// done signals that an earlier write has finished.
func (e *epochDeleteState) done() {
	e.cond.L.Lock()
	e.pending--
	if e.pending == 0 {
		e.cond.Broadcast()
	}
	e.cond.L.Unlock()
}

// Wait blocks until all earlier writes have finished.
func (e *epochDeleteState) Wait() {
	e.cond.L.Lock()
	for e.pending > 0 {
		e.cond.Wait()
	}
	e.cond.L.Unlock()
}

// next bumps the epoch and returns it. Callers must hold e.mu: the counter
// has no synchronization of its own.
func (e *epochTracker) next() uint64 {
	e.epoch++
	return e.epoch
}

// StartWrite should be called before a write is going to start, and after
// it has checked for guards.
func (e *epochTracker) StartWrite() ([]*guard, uint64) {
	e.mu.Lock()
	gen := e.next()
	e.writes++

	if len(e.deletes) == 0 {
		e.mu.Unlock()
		return nil, gen
	}

	guards := make([]*guard, 0, len(e.deletes))
	for _, state := range e.deletes {
		guards = append(guards, state.guard)
	}

	e.mu.Unlock()
	return guards, gen
}

// EndWrite should be called when the write ends for any reason.
func (e *epochTracker) EndWrite(gen uint64) {
	e.mu.Lock()
	if gen <= e.largest {
		// TODO(jeff): at the cost of making waitDelete more
		// complicated, we can keep a sorted slice which would
		// allow this to exit early rather than go over the
		// whole map.
		for dgen, state := range e.deletes {
			if gen > dgen {
				continue
			}
			state.done()
		}
	}
	e.writes--
	e.mu.Unlock()
}
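
// Note: StartWrite and WaitDelete draw generations from the same epoch
// counter, so a write's gen and a delete's gen are totally ordered. A write
// with gen <= largest therefore started before at least one pending delete
// was registered, and the loop above signals done() to exactly the deletes
// whose gen is greater than the write's, i.e. the deletes that counted this
// write as pending.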

// epochWaiter is a type that can be waited on for prior writes to finish.
type epochWaiter struct {
	gen     uint64
	guard   *guard
	state   *epochDeleteState
	tracker *epochTracker
}

// Wait blocks until all writes prior to the creation of the waiter finish.
func (e epochWaiter) Wait() {
	if e.state == nil || e.tracker == nil {
		return
	}
	e.state.Wait()
}

// Done marks the delete as completed, removing its guard.
func (e epochWaiter) Done() {
	e.tracker.mu.Lock()
	delete(e.tracker.deletes, e.gen)
	e.tracker.mu.Unlock()
	e.guard.Done()
}

// WaitDelete should be called after any delete guards have been installed.
// The returned epochWaiter will not be affected by any future writes.
func (e *epochTracker) WaitDelete(guard *guard) epochWaiter {
	e.mu.Lock()
	state := &epochDeleteState{
		pending: e.writes,
		cond:    sync.NewCond(new(sync.Mutex)),
		guard:   guard,
	}

	// record our pending delete
	gen := e.next()
	e.largest = gen
	e.deletes[gen] = state
	e.mu.Unlock()

	return epochWaiter{
		gen:     gen,
		guard:   guard,
		state:   state,
		tracker: e,
	}
}
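
// exampleEpochUsage is an illustrative sketch (not part of the upstream
// file) showing how the write and delete paths coordinate through an
// epochTracker. The *guard value g is assumed to be constructed and
// installed elsewhere in this package (see guard.go); it is taken as a
// parameter here rather than invented.
func exampleEpochUsage(g *guard) {
	tracker := newEpochTracker()

	// Write path: register the write, perform it, then signal completion.
	// The returned guards belong to any pending deletes; a real write would
	// check them before touching the data they cover.
	guards, gen := tracker.StartWrite()
	_ = guards
	// ... perform the write ...
	tracker.EndWrite(gen)

	// Delete path: record the pending delete, wait for all writes that
	// started earlier, perform the delete, then release the guard.
	waiter := tracker.WaitDelete(g)
	waiter.Wait() // returns immediately here since the only write has ended
	// ... perform the delete ...
	waiter.Done()
}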