Skip to content

Commit

Permalink
add platform: allimg
Browse files Browse the repository at this point in the history
  • Loading branch information
drmingdrmer committed Feb 5, 2021
1 parent c6bf3a5 commit 81f6a1b
Show file tree
Hide file tree
Showing 19 changed files with 356 additions and 17 deletions.
2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ python setup.py sdist bdist_wheel
pip install dist/*.tar.gz
)

PYTHONPATH="$(cd ..; pwd)" pytest
PYTHONPATH="$(cd ..; pwd)" pytest -x
67 changes: 54 additions & 13 deletions md2zhihu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,22 +80,31 @@ def code_to_jpg(mdrender, n, width=None, ctx=None):
return [r'<img src="{}" />'.format(mdrender.conf.img_url(fn)), '']

def code_mermaid_to_jpg(mdrender, n, ctx=None):
    """Render a mermaid code-block node as a jpg image reference.

    Delegates to ``typ_text_to_jpg`` with the input type ``'mermaid'`` and
    the node's ``text``.  ``ctx`` is accepted but not used here.
    """
    mermaid_source = n['text']
    return typ_text_to_jpg(mdrender, 'mermaid', mermaid_source)

# strip last \n
d = k3down2.convert('mermaid', n['text'][:-1], 'jpg')
fn = asset_fn(n['text'], 'jpg')
def typ_text_to_jpg(mdrender, typ, txt, opt=None):
    """Convert ``txt`` of input type ``typ`` to a jpg asset and reference it.

    The text is converted with ``k3down2.convert(typ, txt, 'jpg', opt=opt)``;
    the result is written under ``mdrender.conf.output_dir`` with the file
    name produced by ``asset_fn(txt, 'jpg')``.

    Returns:
        A two-item list: a markdown image line pointing at
        ``mdrender.conf.img_url(<file name>)``, followed by an empty string.
    """
    jpg_data = k3down2.convert(typ, txt, 'jpg', opt=opt)
    jpg_name = asset_fn(txt, 'jpg')
    fwrite(mdrender.conf.output_dir, jpg_name, jpg_data)

    image_url = mdrender.conf.img_url(jpg_name)
    image_line = r'![]({})'.format(image_url)
    return [image_line, '']


def math_block_to_imgtag(n, ctx=None):
def math_block_to_imgtag(mdrender, n, ctx=None):
return [k3down2.tex_to_zhihu(n['text'], True)]

def math_inline_to_imgtag(n, ctx=None):
def math_inline_to_imgtag(mdrender, n, ctx=None):
return [k3down2.tex_to_zhihu(n['text'], False)]

def math_inline_to_plaintext(n, ctx=None):

def math_block_to_jpg(mdrender, n, ctx=None):
    """Render a math-block node as a jpg image reference.

    Delegates to ``typ_text_to_jpg`` with the input type ``'tex_block'`` and
    the node's ``text``.  ``ctx`` is accepted but not used here.
    """
    tex_source = n['text']
    return typ_text_to_jpg(mdrender, 'tex_block', tex_source)

def math_inline_to_jpg(mdrender, n, ctx=None):
    """Render an inline-math node as a jpg image reference.

    Delegates to ``typ_text_to_jpg`` with the input type ``'tex_inline'`` and
    the node's ``text``.  ``ctx`` is accepted but not used here.
    """
    tex_source = n['text']
    return typ_text_to_jpg(mdrender, 'tex_inline', tex_source)


def math_inline_to_plaintext(mdrender, n, ctx=None):
    """Render an inline-math node as escaped plain text.

    The node's ``text`` is converted with
    ``k3down2.convert('tex_inline', ..., 'plain')`` and the result is passed
    through ``escape``.  ``mdrender`` and ``ctx`` are accepted but not used.

    Returns:
        A one-item list holding the escaped plain-text string.
    """
    plain_text = k3down2.convert('tex_inline', n['text'], 'plain')
    return [escape(plain_text)]

def table_to_barehtml(mdrender, n, ctx=None):
Expand Down Expand Up @@ -147,10 +156,10 @@ def zhihu_specific(mdrender, n, ctx=None):
return image_local_to_remote(mdrender, n, ctx=ctx)

if typ == 'math_block':
return math_block_to_imgtag(n, ctx=ctx)
return math_block_to_imgtag(mdrender, n, ctx=ctx)

if typ == 'math_inline':
return math_inline_to_imgtag(n, ctx=ctx)
return math_inline_to_imgtag(mdrender, n, ctx=ctx)

if typ == 'table':
return table_to_barehtml(mdrender, n, ctx=ctx)
Expand All @@ -170,10 +179,10 @@ def wechat_specific(mdrender, n, ctx=None):
return image_local_to_remote(mdrender, n, ctx=ctx)

if typ == 'math_block':
return math_block_to_imgtag(n, ctx=ctx)
return math_block_to_imgtag(mdrender, n, ctx=ctx)

if typ == 'math_inline':
return math_inline_to_imgtag(n, ctx=ctx)
return math_inline_to_imgtag(mdrender, n, ctx=ctx)

if typ == 'table':
return table_to_barehtml(mdrender, n, ctx=ctx)
Expand All @@ -197,10 +206,10 @@ def weibo_specific(mdrender, n, ctx=None):
return image_local_to_remote(mdrender, n, ctx=ctx)

if typ == 'math_block':
return math_block_to_imgtag(n, ctx=ctx)
return math_block_to_imgtag(mdrender, n, ctx=ctx)

if typ == 'math_inline':
return math_inline_to_plaintext(n, ctx=ctx)
return math_inline_to_plaintext(mdrender, n, ctx=ctx)

if typ == 'table':
return table_to_jpg(mdrender, n, ctx=ctx)
Expand Down Expand Up @@ -240,13 +249,42 @@ def weibo_specific(mdrender, n, ctx=None):
return None


def allimg_specific(mdrender, n, ctx=None):
    """Platform renderer for ``'allimg'``: turn supported nodes into images.

    Dispatches on ``n['type']``:

    - ``'image'``: handled by ``image_local_to_remote``.
    - ``'math_block'`` / ``'math_inline'``: rendered to jpg.
    - ``'table'``: rendered to jpg.
    - ``'block_code'``: mermaid blocks go through ``code_mermaid_to_jpg``;
      other code goes through ``code_to_jpg``, with ``width=600`` when a
      language is given in ``n['info']``.

    Returns:
        Whatever the selected renderer returns, or ``None`` for any other
        node type.
        # NOTE(review): presumably ``None`` tells the caller to fall back to
        # its default rendering -- confirm against MDRender.
    """
    node_type = n['type']

    if node_type == 'block_code':
        # ``info`` may be None/empty for a fence without a language tag.
        language = n['info'] or ''

        if language == 'mermaid':
            return code_mermaid_to_jpg(mdrender, n, ctx=ctx)

        if language == '':
            return code_to_jpg(mdrender, n, ctx=ctx)

        return code_to_jpg(mdrender, n, width=600, ctx=ctx)

    if node_type == 'image':
        return image_local_to_remote(mdrender, n, ctx=ctx)

    if node_type == 'math_block':
        return math_block_to_jpg(mdrender, n, ctx=ctx)

    if node_type == 'math_inline':
        return math_inline_to_jpg(mdrender, n, ctx=ctx)

    if node_type == 'table':
        return table_to_jpg(mdrender, n, ctx=ctx)

    return None


class MDRender(object):

# platform specific renderer
platforms = {
'zhihu': zhihu_specific,
'wechat':wechat_specific,
'weibo':weibo_specific,
'allimg': allimg_specific,
}

def __init__(self, conf, platform='zhihu'):
Expand Down Expand Up @@ -392,6 +430,9 @@ def render(self, nodes, ctx=None):

return rst

def msg(self, *args):
msg(*args)


def fix_tables(nodes):
"""
Expand Down Expand Up @@ -828,8 +869,8 @@ def main():
parser.add_argument('-p', '--platform', action='store',
required=False,
default='zhihu',
choices=["zhihu", "wechat", "weibo", "allimg"],
help='convert to a platform compatible format.'
' Supported platform: "zhihu", "wechat"'
)

parser.add_argument('--keep-meta', action='store_true',
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ PyYAML~=5.3.1
k3down2~=0.1.13
k3handy
k3color~=0.1.2
k3fs~=0.1.5
Binary file added test/data/allimg/src/assets/slim.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
177 changes: 177 additions & 0 deletions test/data/allimg/src/simple.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
---

refs:
- "slim": https://github.com/openacid/slim "slim"
- "slimarray": https://github.com/openacid/slimarray "slimarray"
- "vlink": https://vlink "vlink"

platform_refs:
zhihu:
- "vlink": https://vlink.zhihu "vlink"

---

# 场景和问题

| | md源文件 | 导入知乎的效果 |
|:-- | :-: | :-: |
|使用前 | a | c |
|转换后 | b | d |

```mermaid
graph LR
A[Hard edge] -->|Link text| B(Round edge)
B --> C{Decision}
C -->|One| D[Result one]
C -->|Two| E[Result two]
```

inline code: `foo = bar`

inline math $$ ||X{\vec {\beta }}-Y||^{2} $$ foo

inline math in codespan ` $$ ||X{\vec {\beta }}-Y||^{2} $$ `
![](assets/slim.jpg)

在时序数据库, 或列存储为基础的系统中, 很常见的形式就是存储一个整数数组,
例如 [slim] 这个项目按天统计的 star 数:

![](assets/slim.jpg)
![](/src/assets/slim.jpg)


我们可以利用数据分布的特点, 将整体数据的大小压缩到**几分之一**.

| Data size | Data Set | gzip size | slimarray size | avg size | ratio |
| --: | :-- | --: | :-- | --: | --: |
| 1,000 | rand u32: [0, 1000] | x | 824 byte | 6 bit/elt | 18% |
| 1,000,000 | rand u32: [0, 1000,000] | x | 702 KB | 5 bit/elt | 15% |
| 1,000,000 | IPv4 DB | 2 MB | 2 MB | 16 bit/elt | 50% |
| 600 | [slim][] star count | 602 byte | 832 byte | 10 bit/elt | 26% |

在达到gzip同等压缩率的前提下, 构建 slimarray 和 访问的性能也非常高:
- 构建 slimarray 时, 平均每秒可压缩 6百万 个数组元素;
- 读取一个数组元素平均花费 7 ns/op.
- 构建 slimarray 时, 平均每秒可压缩 6百万 个数组元素;
- 读取一个数组元素平均花费 `7 ns/op`.

> 在达到gzip同等压缩率的前提下, 构建 slimarray 和 访问的性能也非常高:
> - 构建 slimarray 时, 平均每秒可压缩 6百万 个数组元素;
> - 读取一个数组元素平均花费 7 ns/op.
> - 构建 slimarray 时, 平均每秒可压缩 6百万 个数组元素;
> - 读取一个数组元素平均花费 `7 ns/op`.

按照这种思路, **在给定数组中找到一条曲线来描述点的趋势,**
**再用一个比较小的delta数组修正曲线到实际点的距离, 得到原始值, 就可以实现大幅度的数据压缩. 而且所有的数据都无需解压全部数据就直接读取任意一个.**

# 找到趋势函数

寻找这样一条曲线就使用线性回归,
例如在 [slimarray] 中使用2次曲线 `f(x) = β₁ + β₂x + β₃x²`, 所要做的就是确定每个βᵢ的值,
以使得`f(xⱼ) - yⱼ`的均方差最小. xⱼ是数组下标0, 1, 2...; yⱼ是数组中每个元素的值.

$$
X = \begin{bmatrix}
1 & x_1 & x_1^2 \\
1 & x_2 & x_2^2 \\
\vdots & \vdots & \vdots \\
1 & x_n & x_n^2
\end{bmatrix}
,
\vec{\beta} =
\begin{bmatrix}
\beta_1 \\
\beta_2 \\
\beta_3 \\
\end{bmatrix}
,
Y =
\begin{bmatrix}
y_1 \\
y_2 \\
\vdots \\
y_n
\end{bmatrix}
$$


`spanIndex = OnesCount(bitmap & (1<<(i/16) - 1))`

## 读取过程

读取过程通过找span, 读取span配置,还原原始数据几个步骤完成, 假设 slimarray 的对象是`sa`:

- 通过下标`i` 得到 spanIndex: `spanIndex = OnesCount(sa.bitmap & (1<<(i/16) - 1))`;
- 通过 spanIndex 得到多项式的3个系数: `[b₀, b₁, b₂] = sa.polynomials[spanIndex*3 : spanIndex*3 + 3]`;
- 读取 delta 数组起始位置, 和 delta 数组中每个 delta 的 bit 宽度: `config=sa.configs[spanIndex]`;
- delta 的值保存在 delta 数组的`config.offset + i*config.width`的位置, 从这个位置读取`width`个 bit 得到 delta 的值.
- 计算 `nums[i]` 的值: `b₀ + b₁*i + b₂*i²` 再加上 delta 的值.

简化的读取逻辑如下:

```go
func (sm *SlimArray) Get(i int32) uint32 {

x := float64(i)

bm := sm.spansBitmap & bitmap.Mask[i>>4]
spanIdx := bits.OnesCount64(bm)

j := spanIdx * polyCoefCnt
p := sm.Polynomials
v := int64(p[j] + p[j+1]*x + p[j+2]*x*x)

config := sm.Configs[spanIdx]
deltaWidth := config & 0xff
offset := config >> 8

bitIdx := offset + int64(i)*deltaWidth

d := sm.Deltas[bitIdx>>6]
d = d >> uint(bitIdx&63)

return uint32(v + int64(d&bitmap.Mask[deltaWidth]))
}
```

formula in list:

- 对奇数节点, n = 2k+1, 还是沿用 **多数派** 节点的集合, 大部分场合都可以很好的工作:

$$
Q_{odd}(C) = M(C) = \{ q : q \subseteq C, |q| > |C|/2 \}
$$


- 对偶数节点, n = 2k, **因为n/2个节点跟n/2+1个节点一定有交集**,
我们可以向 M(C) 中加入几个大小为 n/2 的节点集合,

以本文的场景为例,
- 可以设置 Q' = M(abcd) ∪ {ab, bc, ca}, Q'中任意2个元素都有交集;
- 也可以是 Q' = M(abcd) ∪ {bc, cd, bd};

要找到一个更好的偶节点的 quorum 集合, 一个方法是可以把偶数节点的集群看做是一个奇数节点集群加上一个节点x:
$$ D = C \cup \{x\} $$

于是偶数节点的 quorum 集合就可以是 M(D) 的一个扩张:

$$
Q_{even}(D)_x = M(D) \cup M(D \setminus \{x\})
$$

当然这个x可以随意选择, 例如在abcd的例子中, 如果选x = d, 那么
Q' = M(abcd) ∪ {ab, bc, ca};

table in list:

- 链接列表:

| 源文件 | 转换后 | 导入后 |
| :-: | :-: | :-: |
| ![](assets/slim.jpg) | fo | bar |
| a | b | c |

[text-ref]: https://foo.com
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 81f6a1b

Please sign in to comment.