-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
358 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
/* | ||
Agg computes aggregate values over tabular text. | ||
It behaves somewhat like the SQL “GROUP BY” clause. | ||
Usage: | ||
agg [function...] | ||
It reads input from stdin as a sequence of records, one per line. | ||
It treats each line as a set of fields separated by white space. | ||
One field (the first, by default) is designated as the key. | ||
Successive lines with equal keys are grouped into a group, | ||
and agg produces one line of output for each group. | ||
(Note that only contiguous input lines can form a group. | ||
If you need to make sure that all records for a given key | ||
are grouped together, sort the input first.) | ||
For each remaining field, | ||
agg applies a function to all the values in the group, | ||
producing a single output value. | ||
The command line arguments specify which functions to use, | ||
one per field in the input table. | ||
Functions | ||
The available functions are: | ||
key group by this field (default for field 1) | ||
first value from first line of group (default for rest) | ||
last value from last line of group | ||
sample value from any line of group, uniformly at random | ||
prefix longest common string prefix | ||
join:sep concatenate strings with given sep | ||
smin lexically least string | ||
smax lexically greatest string | ||
min numerically least value | ||
max numerically greatest value | ||
sum numeric sum | ||
mean arithmetic mean | ||
count number of records (ignores input value) | ||
const:val print val, ignoring input | ||
drop omit the column entirely | ||
The numeric functions skip items that don't parse as numbers. | ||
Examples | ||
Using the following input: | ||
$ cat >input | ||
-rwx alice 100 /home/alice/bin/crdt | ||
-rw- alice 210002 /home/alice/thesis.tex | ||
-rw- bob 10051 /home/bob/expenses.tab | ||
-rwx kr 862060 /home/kr/bin/blog | ||
-rwx kr 304608 /home/kr/bin/agg | ||
Disk usage for each user, plus where that disk usage occurs | ||
(longest common prefix of filesystem paths): | ||
$ agg <input drop key sum prefix | ||
alice 210153 /home/alice/ | ||
bob 10051 /home/bob/expenses.tab | ||
kr 1166668 /home/kr/ | ||
Disk usage for executable vs non-executable files: | ||
$ sort input | agg key drop sum join:, | ||
-rw- 220053 /home/alice/thesis.tex,/home/bob/expenses.tab | ||
-rwx 1166768 /home/alice/bin/crdt,/home/kr/bin/agg,/home/kr/bin/blog | ||
*/ | ||
package main |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
package main | ||
|
||
// TODO(kr): tests | ||
|
||
import ( | ||
"bufio" | ||
"fmt" | ||
"log" | ||
"math/rand" | ||
"os" | ||
"strings" | ||
"time" | ||
) | ||
|
||
type agg interface { | ||
merge(string) | ||
String() string | ||
} | ||
|
||
var ( | ||
key = 0 | ||
funcmap = make(map[int]func(init, arg string) agg) | ||
argmap = make(map[int]string) | ||
symtab = map[string]func(init, arg string) agg{ | ||
"first": first, | ||
"last": last, | ||
"prefix": prefix, | ||
"sample": sample, | ||
"join": join, | ||
"smin": smin, | ||
"smax": smax, | ||
"min": min, | ||
"max": max, | ||
"sum": sum, | ||
"mean": mean, | ||
"count": count, | ||
"const": constf, | ||
"drop": nil, | ||
} | ||
) | ||
|
||
func main() { | ||
log.SetPrefix("agg: ") | ||
log.SetFlags(0) | ||
rand.Seed(time.Now().UnixNano()) | ||
for i, sym := range os.Args[1:] { | ||
if p := strings.IndexByte(sym, ':'); p >= 0 { | ||
sym, argmap[i] = sym[:p], sym[p+1:] | ||
} | ||
if sym == "key" { | ||
key, sym = i, "first" | ||
} | ||
f, ok := symtab[sym] | ||
if !ok { | ||
log.Fatalf("bad function: %q", sym) | ||
} | ||
funcmap[i] = f | ||
} | ||
|
||
sc := bufio.NewScanner(os.Stdin) | ||
var g *group | ||
for sc.Scan() { | ||
ss := strings.Fields(sc.Text()) | ||
if !matches(g, ss) { | ||
emit(g) | ||
g = &group{key: ss[key]} | ||
} | ||
mergeLine(g, ss) | ||
} | ||
emit(g) | ||
} | ||
|
||
type group struct { | ||
key string | ||
agg []agg | ||
} | ||
|
||
func matches(g *group, ss []string) bool { | ||
return g != nil && g.key == ss[key] | ||
} | ||
|
||
func emit(g *group) { | ||
if g == nil { | ||
return | ||
} | ||
rest := false | ||
for i, a := range g.agg { | ||
if f, ok := funcmap[i]; ok && f == nil { | ||
continue | ||
} | ||
if rest { | ||
fmt.Print("\t") | ||
} | ||
rest = true | ||
fmt.Print(a) | ||
} | ||
fmt.Println() | ||
} | ||
|
||
func mergeLine(g *group, ss []string) { | ||
for i, s := range ss { | ||
if i >= len(g.agg) { | ||
f := funcmap[i] | ||
if f == nil { | ||
f = first | ||
} | ||
g.agg = append(g.agg, f(s, argmap[i])) | ||
} else { | ||
g.agg[i].merge(s) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
package main | ||
|
||
import ( | ||
"math/big" | ||
"strconv" | ||
) | ||
|
||
func min(s, arg string) agg { return newBinop(s, opmin) } | ||
func max(s, arg string) agg { return newBinop(s, opmax) } | ||
func sum(s, arg string) agg { return newBinop(s, opsum) } | ||
|
||
type binop struct { | ||
v *big.Float | ||
f func(a, b *big.Float) *big.Float | ||
} | ||
|
||
func newBinop(s string, f func(a, b *big.Float) *big.Float) *binop { | ||
v, _ := parseFloat(s) | ||
return &binop{v, f} | ||
} | ||
|
||
func (o *binop) String() string { | ||
if o.v == nil { | ||
return "NaN" | ||
} | ||
return o.v.Text('f', -1) | ||
} | ||
|
||
func (o *binop) merge(s string) { | ||
v, ok := parseFloat(s) | ||
if !ok { | ||
return | ||
} | ||
o.v = o.f(o.v, v) | ||
} | ||
|
||
func opmin(a, b *big.Float) *big.Float { | ||
if a != nil && (b == nil || a.Cmp(b) <= 0) { | ||
return a | ||
} | ||
return b | ||
} | ||
|
||
func opmax(a, b *big.Float) *big.Float { | ||
if a != nil && (b == nil || a.Cmp(b) >= 0) { | ||
return a | ||
} | ||
return b | ||
} | ||
|
||
func opsum(a, b *big.Float) *big.Float { | ||
if a == nil { | ||
return b | ||
} else if b == nil { | ||
return a | ||
} | ||
return a.Add(a, b) | ||
} | ||
|
||
type meanagg struct { | ||
v *big.Float | ||
d float64 // actually an integer | ||
} | ||
|
||
func mean(s, arg string) agg { | ||
v, ok := parseFloat(s) | ||
if !ok { | ||
return &meanagg{new(big.Float), 0} | ||
} | ||
return &meanagg{v, 1} | ||
} | ||
|
||
func (m *meanagg) String() string { | ||
if m.d == 0 { | ||
return "NaN" | ||
} | ||
v := new(big.Float).Quo(m.v, big.NewFloat(m.d)) | ||
return v.Text('f', -1) | ||
} | ||
|
||
func (m *meanagg) merge(s string) { | ||
v, ok := parseFloat(s) | ||
if !ok { | ||
return | ||
} | ||
m.v.Add(m.v, v) | ||
m.d++ | ||
} | ||
|
||
func parseFloat(s string) (*big.Float, bool) { | ||
v, _, err := big.ParseFloat(s, 0, 1000, big.ToNearestEven) | ||
return v, err == nil | ||
} | ||
|
||
type counter int | ||
|
||
func count(init, arg string) agg { return new(counter) } | ||
func (c *counter) String() string { return strconv.Itoa(int(*c) + 1) } | ||
func (c *counter) merge(string) { *c++ } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
package main | ||
|
||
import ( | ||
"math/rand" | ||
"strings" | ||
) | ||
|
||
func first(s, arg string) agg { return &sbinop{s, opfirst} } | ||
func last(s, arg string) agg { return &sbinop{s, oplast} } | ||
func prefix(s, arg string) agg { return &sbinop{s, opprefix} } | ||
func join(s, arg string) agg { return &sbinop{s, opjoin(arg)} } | ||
func smin(s, arg string) agg { return &sbinop{s, opsmin} } | ||
func smax(s, arg string) agg { return &sbinop{s, opsmax} } | ||
|
||
type sbinop struct { | ||
s string | ||
f func(a, b string) string | ||
} | ||
|
||
func (o *sbinop) String() string { return o.s } | ||
|
||
func (o *sbinop) merge(s string) { o.s = o.f(o.s, s) } | ||
|
||
func opfirst(a, b string) string { return a } | ||
func oplast(a, b string) string { return b } | ||
|
||
func opprefix(a, b string) string { | ||
for i := range a { | ||
if i >= len(b) || a[i] != b[i] { | ||
return a[:i] | ||
} | ||
} | ||
return a | ||
} | ||
|
||
func opjoin(sep string) func(a, b string) string { | ||
return func(a, b string) string { | ||
return a + sep + b // TODO(kr): too slow? maybe strings.Join? | ||
} | ||
} | ||
|
||
func opsmin(a, b string) string { | ||
if strings.Compare(a, b) <= 0 { | ||
return a | ||
} | ||
return b | ||
} | ||
|
||
func opsmax(a, b string) string { | ||
if strings.Compare(a, b) >= 0 { | ||
return a | ||
} | ||
return b | ||
} | ||
|
||
type sampler struct { | ||
n int | ||
s string | ||
} | ||
|
||
func sample(s, arg string) agg { return &sampler{1, s} } | ||
func (p *sampler) String() string { return p.s } | ||
func (p *sampler) merge(s string) { | ||
p.n++ | ||
if rand.Intn(p.n) == 0 { | ||
p.s = s | ||
} | ||
} | ||
|
||
type constant string | ||
|
||
func constf(init, arg string) agg { return constant(arg) } | ||
func (c constant) String() string { return string(c) } | ||
func (c constant) merge(string) {} |