diff --git a/cmd/agg/doc.go b/cmd/agg/doc.go
new file mode 100644
index 0000000..3a9ec2d
--- /dev/null
+++ b/cmd/agg/doc.go
@@ -0,0 +1,73 @@
+/*
+
+Agg computes aggregate values over tabular text.
+It behaves somewhat like the SQL “GROUP BY” clause.
+
+Usage:
+
+	agg [function...]
+
+It reads input from stdin as a sequence of records, one per line.
+It treats each line as a sequence of fields separated by white space.
+One field (the first, by default) is designated as the key.
+Successive lines with equal keys are collected into a group,
+and agg produces one line of output for each group.
+(Note that only contiguous input lines can form a group.
+If you need to make sure that all records for a given key
+are grouped together, sort the input first.)
+
+For each remaining field,
+agg applies a function to all the values in the group,
+producing a single output value.
+The command line arguments specify which functions to use,
+one per field in the input table.
+
+Functions
+
+The available functions are:
+
+	key        group by this field (default for field 1)
+	first      value from first line of group (default for rest)
+	last       value from last line of group
+	sample     value from any line of group, uniformly at random
+	prefix     longest common string prefix
+	join:sep   concatenate strings with given sep
+	smin       lexically least string
+	smax       lexically greatest string
+	min        numerically least value
+	max        numerically greatest value
+	sum        numeric sum
+	mean       arithmetic mean
+	count      number of records (ignores input value)
+	const:val  print val, ignoring input
+	drop       omit the column entirely
+
+The numeric functions skip items that don't parse as numbers; if no item in a group parses, the result is NaN.
+
+Examples
+
+Using the following input:
+
+	$ cat >input
+	-rwx alice 100 /home/alice/bin/crdt
+	-rw- alice 210002 /home/alice/thesis.tex
+	-rw- bob 10051 /home/bob/expenses.tab
+	-rwx kr 862060 /home/kr/bin/blog
+	-rwx kr 304608 /home/kr/bin/agg
+
+Disk usage for each user, plus where that disk usage occurs
+(longest common prefix of filesystem paths):
+
+	$ agg = 0 {
+			sym, argmap[i] = sym[:p], sym[p+1:]
+		}
+		if sym == "key" {
+			key, sym = i, "first"
+		}
+		f, ok := symtab[sym]
+		if !ok {
+			log.Fatalf("bad function: %q", sym)
+		}
+		funcmap[i] = f
+	}
+
+	sc := bufio.NewScanner(os.Stdin)
+	var g *group
+	for sc.Scan() {
+		ss := strings.Fields(sc.Text())
+		// A blank line, or one too short to contain the key field, has
+		// no key to group by; skip it rather than index out of range.
+		if len(ss) <= key {
+			continue
+		}
+		if !matches(g, ss) {
+			emit(g)
+			g = &group{key: ss[key]}
+		}
+		mergeLine(g, ss)
+	}
+	emit(g)
+	// Report a failed read (bufio.Scanner also stops on an over-long
+	// line) instead of exiting as if the input had ended normally.
+	if err := sc.Err(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// group holds the aggregates for one run of lines sharing a key.
+type group struct {
+	key string
+	agg []agg // one aggregate per input field, indexed by field position
+}
+
+// matches reports whether the fields ss belong to the current group g.
+// It is false when there is no current group.
+func matches(g *group, ss []string) bool {
+	return g != nil && g.key == ss[key]
+}
+
+// emit prints g's aggregates on one line, separated by tabs.
+// It does nothing when g is nil.
+func emit(g *group) {
+	if g == nil {
+		return
+	}
+	rest := false
+	for i, a := range g.agg {
+		// A nil entry in funcmap marks a column that is left out of
+		// the output (presumably the "drop" function).
+		if f, ok := funcmap[i]; ok && f == nil {
+			continue
+		}
+		if rest {
+			fmt.Print("\t")
+		}
+		rest = true
+		fmt.Print(a)
+	}
+	fmt.Println()
+}
+
+// mergeLine folds the fields of one input line into g. A field position
+// seen for the first time in the group gets a new aggregate, built by the
+// function registered for that position (first when there is none); later
+// values at that position are merged into the existing aggregate.
+func mergeLine(g *group, ss []string) {
+	for i, s := range ss {
+		if i >= len(g.agg) {
+			f := funcmap[i]
+			if f == nil {
+				f = first
+			}
+			g.agg = append(g.agg, f(s, argmap[i]))
+		} else {
+			g.agg[i].merge(s)
+		}
+	}
+}
diff --git a/cmd/agg/num.go b/cmd/agg/num.go
new file mode 100644
index 0000000..93ac3fe
--- /dev/null
+++ b/cmd/agg/num.go
@@ -0,0 +1,106 @@
+package main
+
+import (
+	"math/big"
+	"strconv"
+)
+
+// min, max and sum start a numeric aggregate from the first value s; arg is unused.
+func min(s, arg string) agg { return newBinop(s, opmin) }
+func max(s, arg string) agg { return newBinop(s, opmax) }
+func sum(s, arg string) agg { return newBinop(s, opsum) }
+
+// binop folds the values that parse as numbers using f. v stays nil, and
+// String reports "NaN", until some value has parsed.
+type binop struct {
+	v *big.Float
+	f func(a, b *big.Float) *big.Float
+}
+
+func newBinop(s string, f func(a, b *big.Float) *big.Float) *binop {
+	v, _ := parseFloat(s)
+	return &binop{v, f}
+}
+
+func (o *binop) String() string {
+	if o.v == nil {
+		return "NaN"
+	}
+	return o.v.Text('f', -1)
+}
+
+func (o *binop) merge(s string) {
+	v, ok := parseFloat(s)
+	if !ok {
+		return
+	}
+	o.v = o.f(o.v, v)
+}
+
+func opmin(a, b *big.Float) *big.Float {
+	if a != nil && (b == nil || a.Cmp(b) <= 0) {
+		return a
+	}
+	return b
+}
+
+func opmax(a, b *big.Float) *big.Float {
+	if a != nil && (b == nil || a.Cmp(b) >= 0) {
+		return a
+	}
+	return b
+}
+
+func opsum(a, b *big.Float) *big.Float {
+	if a == nil {
+		return b
+	} else if b == nil {
+		return a
+	}
+	return a.Add(a, b)
+}
+
+// meanagg keeps the sum v and the count d of the values that parsed.
+type meanagg struct {
+	v *big.Float
+	d float64 // actually an integer
+}
+
+func mean(s, arg string) agg {
+	v, ok := parseFloat(s)
+	if !ok {
+		return &meanagg{new(big.Float), 0}
+	}
+	return &meanagg{v, 1}
+}
+
+func (m *meanagg) String() string {
+	if m.d == 0 {
+		return "NaN"
+	}
+	v := new(big.Float).Quo(m.v, big.NewFloat(m.d))
+	return v.Text('f', -1)
+}
+
+func (m *meanagg) merge(s string) {
+	v, ok := parseFloat(s)
+	if !ok {
+		return
+	}
+	m.v.Add(m.v, v)
+	m.d++
+}
+
+// parseFloat parses s with 1000 bits of precision. It reports false when s
+// is not a number.
+func parseFloat(s string) (*big.Float, bool) {
+	v, _, err := big.ParseFloat(s, 0, 1000, big.ToNearestEven)
+	return v, err == nil
+}
+
+// counter counts merge calls; String adds one for the line that created it.
+type counter int
+
+func count(init, arg string) agg  { return new(counter) }
+func (c *counter) String() string { return strconv.Itoa(int(*c) + 1) }
+func (c *counter) merge(string)   { *c++ }
diff --git a/cmd/agg/string.go b/cmd/agg/string.go
new file mode 100644
index 0000000..9a8cf78
--- /dev/null
+++ b/cmd/agg/string.go
@@ -0,0 +1,88 @@
+package main
+
+import (
+	"math/rand"
+	"strings"
+)
+
+func first(s, arg string) agg  { return &sbinop{s, opfirst} }
+func last(s, arg string) agg   { return &sbinop{s, oplast} }
+func prefix(s, arg string) agg { return &sbinop{s, opprefix} }
+func join(s, arg string) agg   { return &sbinop{s, opjoin(arg)} }
+func smin(s, arg string) agg   { return &sbinop{s, opsmin} }
+func smax(s, arg string) agg   { return &sbinop{s, opsmax} }
+
+// sbinop folds string values with f, starting from the first value seen.
+type sbinop struct {
+	s string
+	f func(a, b string) string
+}
+
+func (o *sbinop) String() string { return o.s }
+
+func (o *sbinop) merge(s string) { o.s = o.f(o.s, s) }
+
+func opfirst(a, b string) string { return a }
+func oplast(a, b string) string  { return b }
+
+// opprefix returns the longest common prefix of a and b that ends on a
+// rune boundary of a. Whole runes are compared, so two runes that merely
+// share a leading byte (such as "é" and "è") are not treated as equal.
+func opprefix(a, b string) string {
+	end := 0 // a[:end] is the prefix of b matched so far
+	for i := range a {
+		// a[end:i] is exactly one rune of a (or empty when i == 0).
+		if !strings.HasPrefix(b[end:], a[end:i]) {
+			return a[:end]
+		}
+		end = i
+	}
+	// The loop stops before comparing the final rune of a.
+	if !strings.HasPrefix(b[end:], a[end:]) {
+		return a[:end]
+	}
+	return a
+}
+
+func opjoin(sep string) func(a, b string) string {
+	return func(a, b string) string {
+		return a + sep + b // TODO(kr): too slow? maybe strings.Join?
+	}
+}
+
+func opsmin(a, b string) string {
+	if strings.Compare(a, b) <= 0 {
+		return a
+	}
+	return b
+}
+
+func opsmax(a, b string) string {
+	if strings.Compare(a, b) >= 0 {
+		return a
+	}
+	return b
+}
+
+// sampler holds one value chosen uniformly from the n values seen so far:
+// the nth value replaces the held one with probability 1/n.
+type sampler struct {
+	n int
+	s string
+}
+
+func sample(s, arg string) agg    { return &sampler{1, s} }
+func (p *sampler) String() string { return p.s }
+func (p *sampler) merge(s string) {
+	p.n++
+	if rand.Intn(p.n) == 0 {
+		p.s = s
+	}
+}
+
+// constant prints the function argument and ignores every input value.
+type constant string
+
+func constf(init, arg string) agg { return constant(arg) }
+func (c constant) String() string { return string(c) }
+func (c constant) merge(string)   {}
diff --git a/colwriter/column_test.go b/colwriter/column_test.go
index 8d0bf8f..ce388f5 100644
--- a/colwriter/column_test.go
+++ b/colwriter/column_test.go
@@ -36,7 +36,7 @@ version.go
 windows.go
 `[1:]
 
-var tests = []struct{
+var tests = []struct {
 	wid  int
 	flag uint
 	src  string
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..fa0528b
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module "github.com/kr/text"
+
+require "github.com/kr/pty" v1.1.1
diff --git a/wrap.go b/wrap.go
old mode 100755
new mode 100644
index ca88565..b09bb03
--- a/wrap.go
+++ b/wrap.go
@@ -31,7 +31,7 @@ func WrapBytes(b []byte, lim int) []byte {
 
 // WrapWords is the low-level line-breaking algorithm, useful if you need more
 // control over the details of the text wrapping process. For most uses, either
-// Wrap or WrapBytes will be sufficient and more convenient. 
+// Wrap or WrapBytes will be sufficient and more convenient.
 //
 // WrapWords splits a list of words into lines with minimal "raggedness",
 // treating each byte as one unit, accounting for spc units between adjacent
@@ -58,7 +58,7 @@ func WrapWords(words [][]byte, spc, lim, pen int) [][][]byte {
 		cost[i] = math.MaxInt32
 	}
 	for i := n - 1; i >= 0; i-- {
-		if length[i][n-1] <= lim {
+		if length[i][n-1] <= lim || i == n-1 {
 			cost[i] = 0
 			nbrk[i] = n
 		} else {
diff --git a/wrap_test.go b/wrap_test.go
index 90f065c..634b6e8 100644
--- a/wrap_test.go
+++ b/wrap_test.go
@@ -42,3 +42,24 @@ func TestWrapOneLine(t *testing.T) {
 		t.Fail()
 	}
 }
+
+// TestWrapBug1 checks Wrap on text whose final word is longer than the
+// limit: the word must come back whole, on a line of its own.
+func TestWrapBug1(t *testing.T) {
+	type wrapCase struct {
+		limit int
+		text  string
+		want  string
+	}
+	table := []wrapCase{
+		{limit: 4, text: "aaaaa", want: "aaaaa"},
+		{limit: 4, text: "a aaaaa", want: "a\naaaaa"},
+	}
+
+	for i := range table {
+		c := table[i]
+		if got := Wrap(c.text, c.limit); got != c.want {
+			t.Errorf("Wrap(%q, %d) = %q want %q", c.text, c.limit, got, c.want)
+		}
+	}
+}