Skip to content

Commit

Permalink
cmd/agg: new command
Browse files Browse the repository at this point in the history
  • Loading branch information
kr committed May 4, 2016
1 parent bb797dc commit 7cafcd8
Show file tree
Hide file tree
Showing 4 changed files with 358 additions and 0 deletions.
73 changes: 73 additions & 0 deletions cmd/agg/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
Agg computes aggregate values over tabular text.
It behaves somewhat like the SQL “GROUP BY” clause.
Usage:
agg [function...]
It reads input from stdin as a sequence of records, one per line.
It treats each line as a set of fields separated by white space.
One field (the first, by default) is designated as the key.
Successive lines with equal keys are grouped into a group,
and agg produces one line of output for each group.
(Note that only contiguous input lines can form a group.
If you need to make sure that all records for a given key
are grouped together, sort the input first.)
For each remaining field,
agg applies a function to all the values in the group,
producing a single output value.
The command line arguments specify which functions to use,
one per field in the input table.
Functions
The available functions are:
key group by this field (default for field 1)
first value from first line of group (default for rest)
last value from last line of group
sample value from any line of group, uniformly at random
prefix longest common string prefix
join:sep concatenate strings with given sep
smin lexically least string
smax lexically greatest string
min numerically least value
max numerically greatest value
sum numeric sum
mean arithmetic mean
count number of records (ignores input value)
const:val print val, ignoring input
drop omit the column entirely
The numeric functions skip items that don't parse as numbers.
Examples
Using the following input:
$ cat >input
-rwx alice 100 /home/alice/bin/crdt
-rw- alice 210002 /home/alice/thesis.tex
-rw- bob 10051 /home/bob/expenses.tab
-rwx kr 862060 /home/kr/bin/blog
-rwx kr 304608 /home/kr/bin/agg
Disk usage for each user, plus where that disk usage occurs
(longest common prefix of filesystem paths):
$ agg <input drop key sum prefix
alice 210153 /home/alice/
bob 10051 /home/bob/expenses.tab
kr 1166668 /home/kr/
Disk usage for executable vs non-executable files:
$ sort input | agg key drop sum join:,
-rw- 220053 /home/alice/thesis.tex,/home/bob/expenses.tab
-rwx 1166768 /home/alice/bin/crdt,/home/kr/bin/agg,/home/kr/bin/blog
*/
package main
112 changes: 112 additions & 0 deletions cmd/agg/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package main

// TODO(kr): tests

import (
"bufio"
"fmt"
"log"
"math/rand"
"os"
"strings"
"time"
)

type agg interface {
merge(string)
String() string
}

var (
key = 0
funcmap = make(map[int]func(init, arg string) agg)
argmap = make(map[int]string)
symtab = map[string]func(init, arg string) agg{
"first": first,
"last": last,
"prefix": prefix,
"sample": sample,
"join": join,
"smin": smin,
"smax": smax,
"min": min,
"max": max,
"sum": sum,
"mean": mean,
"count": count,
"const": constf,
"drop": nil,
}
)

func main() {
log.SetPrefix("agg: ")
log.SetFlags(0)
rand.Seed(time.Now().UnixNano())
for i, sym := range os.Args[1:] {
if p := strings.IndexByte(sym, ':'); p >= 0 {
sym, argmap[i] = sym[:p], sym[p+1:]
}
if sym == "key" {
key, sym = i, "first"
}
f, ok := symtab[sym]
if !ok {
log.Fatalf("bad function: %q", sym)
}
funcmap[i] = f
}

sc := bufio.NewScanner(os.Stdin)
var g *group
for sc.Scan() {
ss := strings.Fields(sc.Text())
if !matches(g, ss) {
emit(g)
g = &group{key: ss[key]}
}
mergeLine(g, ss)
}
emit(g)
}

type group struct {
key string
agg []agg
}

func matches(g *group, ss []string) bool {
return g != nil && g.key == ss[key]
}

func emit(g *group) {
if g == nil {
return
}
rest := false
for i, a := range g.agg {
if f, ok := funcmap[i]; ok && f == nil {
continue
}
if rest {
fmt.Print("\t")
}
rest = true
fmt.Print(a)
}
fmt.Println()
}

func mergeLine(g *group, ss []string) {
for i, s := range ss {
if i >= len(g.agg) {
f := funcmap[i]
if f == nil {
f = first
}
g.agg = append(g.agg, f(s, argmap[i]))
} else {
g.agg[i].merge(s)
}
}
}
99 changes: 99 additions & 0 deletions cmd/agg/num.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package main

import (
"math/big"
"strconv"
)

func min(s, arg string) agg { return newBinop(s, opmin) }
func max(s, arg string) agg { return newBinop(s, opmax) }
func sum(s, arg string) agg { return newBinop(s, opsum) }

type binop struct {
v *big.Float
f func(a, b *big.Float) *big.Float
}

func newBinop(s string, f func(a, b *big.Float) *big.Float) *binop {
v, _ := parseFloat(s)
return &binop{v, f}
}

func (o *binop) String() string {
if o.v == nil {
return "NaN"
}
return o.v.Text('f', -1)
}

func (o *binop) merge(s string) {
v, ok := parseFloat(s)
if !ok {
return
}
o.v = o.f(o.v, v)
}

func opmin(a, b *big.Float) *big.Float {
if a != nil && (b == nil || a.Cmp(b) <= 0) {
return a
}
return b
}

func opmax(a, b *big.Float) *big.Float {
if a != nil && (b == nil || a.Cmp(b) >= 0) {
return a
}
return b
}

func opsum(a, b *big.Float) *big.Float {
if a == nil {
return b
} else if b == nil {
return a
}
return a.Add(a, b)
}

type meanagg struct {
v *big.Float
d float64 // actually an integer
}

func mean(s, arg string) agg {
v, ok := parseFloat(s)
if !ok {
return &meanagg{new(big.Float), 0}
}
return &meanagg{v, 1}
}

func (m *meanagg) String() string {
if m.d == 0 {
return "NaN"
}
v := new(big.Float).Quo(m.v, big.NewFloat(m.d))
return v.Text('f', -1)
}

func (m *meanagg) merge(s string) {
v, ok := parseFloat(s)
if !ok {
return
}
m.v.Add(m.v, v)
m.d++
}

func parseFloat(s string) (*big.Float, bool) {
v, _, err := big.ParseFloat(s, 0, 1000, big.ToNearestEven)
return v, err == nil
}

type counter int

func count(init, arg string) agg { return new(counter) }
func (c *counter) String() string { return strconv.Itoa(int(*c) + 1) }
func (c *counter) merge(string) { *c++ }
74 changes: 74 additions & 0 deletions cmd/agg/string.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package main

import (
"math/rand"
"strings"
)

func first(s, arg string) agg { return &sbinop{s, opfirst} }
func last(s, arg string) agg { return &sbinop{s, oplast} }
func prefix(s, arg string) agg { return &sbinop{s, opprefix} }
func join(s, arg string) agg { return &sbinop{s, opjoin(arg)} }
func smin(s, arg string) agg { return &sbinop{s, opsmin} }
func smax(s, arg string) agg { return &sbinop{s, opsmax} }

type sbinop struct {
s string
f func(a, b string) string
}

func (o *sbinop) String() string { return o.s }

func (o *sbinop) merge(s string) { o.s = o.f(o.s, s) }

func opfirst(a, b string) string { return a }
func oplast(a, b string) string { return b }

func opprefix(a, b string) string {
for i := range a {
if i >= len(b) || a[i] != b[i] {
return a[:i]
}
}
return a
}

func opjoin(sep string) func(a, b string) string {
return func(a, b string) string {
return a + sep + b // TODO(kr): too slow? maybe strings.Join?
}
}

func opsmin(a, b string) string {
if strings.Compare(a, b) <= 0 {
return a
}
return b
}

func opsmax(a, b string) string {
if strings.Compare(a, b) >= 0 {
return a
}
return b
}

type sampler struct {
n int
s string
}

func sample(s, arg string) agg { return &sampler{1, s} }
func (p *sampler) String() string { return p.s }
func (p *sampler) merge(s string) {
p.n++
if rand.Intn(p.n) == 0 {
p.s = s
}
}

type constant string

func constf(init, arg string) agg { return constant(arg) }
func (c constant) String() string { return string(c) }
func (c constant) merge(string) {}

0 comments on commit 7cafcd8

Please sign in to comment.