From cc6a6e4105c031b00cd82376cb56c66a0092301a Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 25 Apr 2024 15:05:49 -0700 Subject: [PATCH] Improve model quality --- README.md | 16 +++++++--------- benchmark.py | 2 +- models/dt.joblib | Bin 11033 -> 11225 bytes pdftext/inference.py | 15 ++++++++++++--- pyproject.toml | 2 +- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f4f7ed6..6eab318 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # PDFText -Text extraction like [PyMuPDF]((https://github.com/pymupdf/PyMuPDF), but without the AGPL license. PDFText extracts plain text or structured blocks and lines. It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed. +Text extraction like [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license. PDFText extracts plain text or structured blocks and lines. It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed. # Installation @@ -81,13 +81,11 @@ I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthe Here are the scores: -+------------+-------------------+-----------------------------------------+ -| Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) | -+------------+-------------------+-----------------------------------------+ -| pymupdf | 0.31 | -- | -| pdftext | 1.45 | 95.64 | -| pdfplumber | 2.97 | 89.88 | -+------------+-------------------+-----------------------------------------+ +| Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) | +|------------|-------------------|-----------------------------------------| +| pymupdf | 0.32 | -- | +| pdftext | 1.79 | 96.22 | +| pdfplumber | 3.0 | 89.88 | pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information). @@ -127,6 +125,6 @@ This is built on some amazing open source work, including: - [pypdfium2](https://github.com/pypdfium2-team/pypdfium2) - [scikit-learn](https://scikit-learn.org/stable/index.html) -- [pypdf2](https://github.com/py-pdf/benchmarks) for very thorough and fair benchmarks +- [pypdf](https://github.com/py-pdf/benchmarks) for very thorough and fair benchmarks Thank you to the [pymupdf](https://github.com/pymupdf/PyMuPDF) devs for creating such a great library - I just wish it had a simpler license! \ No newline at end of file diff --git a/benchmark.py b/benchmark.py index 263246b..a2ba903 100644 --- a/benchmark.py +++ b/benchmark.py @@ -96,7 +96,7 @@ def main(): table_alignments.insert(0, "--") table = [(tool, time, alignment) for tool, time, alignment in zip(times_tools, table_times, table_alignments)] - table = tabulate.tabulate(table, tablefmt="pretty", headers=headers) + table = tabulate.tabulate(table, tablefmt="github", headers=headers) print(table) results = { diff --git a/models/dt.joblib b/models/dt.joblib index c544e078eaaabfc37d12b54114180fb25678c7de..57791551d2a1e468329617757c8f6a5b0b8adc43 100644 GIT binary patch literal 11225 zcma)ic|27A_x})42vJCjNK}%tgvvpSYAjK-h%wd~jU~n!(qe6j7-dOWii#{LNwQ{1 zl%=SUeJfj~lBn-B_Zoih&+qYI{_wo7*FDeIIj?ih>%3m~UcPJ*I;=RxkG9PD6vc|@ zWG_o~va*sTSWt-0&Q{KJ+VVYC7G!6#gZ&YtfA>@`*_v$SL=UHLI7j!UOVbuvIFYGV zPKX4Z#%4pdC(~)1&W;o^73oBJ%&nZMblM_2qC3IT%8^Q<<2h*jc4T{kGtthGV&zO= z$kOp5G``urh%MH1JPS>Ds?*JiY(t_FteuD!R74MfaZ?K}dbVq&37(;GPYrHuMWnho zAr|Y>mQ1y$WC->SmW&>5C!)QjgB`({N~BuR@f&H&riLZkIl3UWdmx5cI9U;$t>{{) zGjKQ>H^Tv#W@JX@L<-T~!pahviiL$Efk<&A5od|>*g4ocP#x^a76c1qzK9L>Op6GJ zDmq?(w%DHF;6inDp*kZ?78)n!Vgj8eVC!H`A)CwEyVyB;5FE)CNGRwue%-02hLaP~ z$=48W0r~4z5qw&nn5=KDSr&1kp z|I+_A^{0A~#GT4_L=|3uv;xg^(khY|X)T_`k4};xts_a26iJ%{NwQNDUWm?jT4BT~ z7F0E!Wy-I}Dj_XG(thz&Iua}#TNcWM| zaAwI=M<2K>dB(W2SBo|7HPJ?6nupawoCxDC4#$fj0wa=1Ct-li+}2dF8$8}TX51O^ z;I-u2WWwYxz~mQW$cG^Uj=wU5phJ7N&Q-$?;7Gj7xYMtxm@GY@tuZY&oj>eY$1MAO zmnDT1)W6?WIZEmU*%u2L5?nGD8fwBNn0zz~Xa4@hF0tVXQ>%81eEC-kwWes#S;dc1 z_Cy{g9~~8k8gF3RL9ieII)XG7*5-PHrwD+jnI#Y^h<0$7-QF3fI5njzQI{+za- zBJkQyHK!6ZW8)cJA5Vp!R^ZsryY4_}bYIgmNuSrAnC2@VURBWsu0a<6M! z#>~ID{IB83N#vrpgOd$ve_k8Ed0OgUbxa*@F8=KSK7*5&-#~i?3xneQ;8pJknVV_< zbiN2;15VEeX^y-N`lcovVcvG+Q5FnejA2MrK4Kr)g5S%ue|o;tc|VKKw;`h`U#$6S zm6Aph=)9*e__C#>b#Z%lG4;=7znICZFX2_3IK?P=e;K zWcTLc1uXy6F?Dn1Uz|h2k6`wA&}tK7P~3`N=#5`nJ8%6XE66?1!#={XMQCLSXsF2x zyEo(i6__ELF=W9I0d)!~1~T@?mntT)&5`2fH4jck2HHS2vy5%d~$b##@Ww^m;;?>Pt<+;Hr1Ug8;jBxb*_vsb0QJ zOjTPV!_>bD zsz5yQNY!G>O#NJg=@rB9zx7WnjVQLc;}x7ggX-U9|9aO#>iqW)vw_S>`@?zALO>Pl zOH06U?^`@?`>9vP2G=Zp2X{nK{X2NZJ#h&OnS61~ZV3#h&jU!aP(sD5P<8mh>Gj7u z;NB-xAk0_y@o6U^rvK;i-)@Dj7B#&tSQm!!`~KVtFz(}E>Yt0R?aI?;_3RD!X`_6e zuM-^s6EpTpqTRUl7|zO{LdjG|klx`Ye%h}bZVsdMFW^hbp2)PF^Vjdp({dxW1-dSx zp6L&fAN~_Cvwxsf0f*a&>Hly1x${}`y^=~G*A~5V)~v=)xNN3=N@KhY7|zPy42>tY zkmG27JF4X=Nb95ecWrw|+H^{8-u55yerGY%QVp6yEDQ}MQPF2SMdyEhIHYm(u-9Y? z^r)bG{if#ds0wMO{!N(OG8oSCzxwCrzCa(W4qE434Phea{@eNMhswvt>zMqx;(x^Y z){5188zC_h)nC4-%$7JdbAFnOkNf;EJnmiv4BMdolM#&Ne*JT1{iC}A4!7lh{aU_Q=icwe0yMEMs3j0^q84n!j@dxMDJ+Vc=n_mM{H|PFYRc-t6lV%yvcA$)BqAk5E zhGx#MbL9`8Y?$B$QUlnWNBKRDOZU3}nW;b9F++}E$c-TazJV{VL!MFRx&24lU_uQ& zzpp;}qlp%&&z!$9Ca;Cz?DNZ)M{%$*YAi`9pcRe*dgoC`!sq29n#}pDU_5mU(N)H1 zIBxNA0&1n4>b{?CK#fAe4FsrW#y0Hz~F{%jHIW)6=cVhYh!&&tsqho7gf&1or5q~S% zAiV(%w0{@IQ^RnU{|;%s4F|J#1wwnyG=a%QR6sJmZCpQi{`;?XewK6l znpUX66m0J?ieXjeocH|Azo~i@5#GV&y{P_d1KShv&GYAz%&ShYJnser1C*Z^n!M%Y zEdNa#vJaa-ddkBZxHj;duxGQ-Y=X8#v~QT#+P9G?!StUdCW@KId>%!++u-xn@fS*^ zZy~}S?b%Q|tI1=xS=tj}l9`R_kV4@mOX z+-?A0?4TF2|3KOAg){r_T=`cM;=GhV&4lc=sDb_mnnaF$oB#UXPo(CWH#Y$R+aTz7 zc}tW(&8+{q^sA@vejM4-26(K%pMD*vaVTr%`D;Jsx5F6Ds-I?rk#Mk0A276eRS#Wv zSQruqS`H_E(A~!L{{f752*X+RM~?GKIK01@7+K!+8v6ZEzVN2w$tezUO#VTPr-LC{ zr5KHjqjRIMzSQC@`a@^bMp`gU3Bo z;Jag$$flH)O#WQ{!MfuD`ri+KrRr`Jfdh6>DDChRu$;C2^)Y*~@_<`BV}FLZZFhm!(~f_p1MlIN z33{XB!oMrUH2!(ppQBK#bcv-DIIxXQp{`hhGm~}R{#S^%D8=`*!U%Rik662N+>12x z{4`hm8!E+Kg^PcJKWk9?FaNz?&^qh=nISgclNipbKW&#ay*w!}KaSIhN?NDqy!nua&zoQrreLI1}&02qa z&dXDP@;&b5UdJ}ba6tvt>Yl{hJU{dNbPVGk$8gsE6<1Un2AY=*TL+)C0bmPM`9uFT zonO36`{(js|4YAILTEbx=KpIfr+axUX7Yb7eoj|zenaplz+n$M(mGoc%Sz`TzZKHB zgv=i;a2k8i8H(W2Xce8mekJ_HQ@gjf!lezUgElIT4ZdGK!^hGZXN~!9R{g3GcQYz< zy`y?m+q4GAW-JVehy6vCBL&LL_?con0*14mztp!iMS#0vjwruoB^=m+@)eCUI9b}2 z=Iwtz9?i$=k@|asTNGI4;*1yk3RI$ zmN4Ttm;K}JNjI9#H$vq~RKc>MZAN38X7Xn)zI&wxO=$Q9gkv8_K9wq(G!D+6KapL5 zKUdibVv?x-%5dSLAG7Lr#PkCoZJ{fX;(~ll{O`vxtH zS#3|~t*eD#jURix{WHK^G%M$n%3at|N+$Q2SHizu_n6kJUEosB<>GZE7l?O6lRYXM zARsH_O8TuhIW9>o3q zc^&j$80Me()ehd=7k~MHH$7YNV)?l1w+`1c#ma@F*0FCUc_KOWlSY_{&|KlAYii z`1?mP3hOgN&6pL{ac)=~H=x}ft)Z>;ecperG!#{XO z)7y{1<7k`EWrD}S<@(o6N4>s)Q0 zW}mM{%Q^53o7{LJ`VjOtJH(bUs{m`b}kkZN1KuXr*+Nc9M5Mst(|qGjO!7w`&8lB6w{%va_e?! z(?s~~zlG$P&`WNip`OrnUC@)IvPXF_Px+R1^M0q}WwWUz8$G#uY$>m75)4aR-@+Z1oSgAXPqNnJHvaA@r{ zz2s+B@TV)yO|JGlEQ|iCdxQTTtXptxQUaNTwiEbrkET$AGp&T@^d1sv(H zabn}EpnC)GV(fUnNm2qRe|_M-xzZaps0Jjka{0n7i5=^@OsZfaSd1c*(g80ntzgL= zEe9h>Z=T)=PIx?6@msk&0{(>z4@_gYJ z?|0V&l{Ju(BXhRTrweWcXo^pkXG6$JAEV8Dhhg&`ams|p9gxgZ=l>Tm1XtAZ6U86u z1Fui+c!abJuqOPC-`E!l554T_-Ry_qb3@7A8=U&!R2r?Wm`Q?$=(Dj^j!)nVD|c1) z@Bl;;8W%Q97=Xkv>sM65Es(jE;dW?kKUm0~$r$@t1{GmhpFfT}!M6Qem!DakfI*ze zo~R3<;Pb~?SlQr@IH#Y21TudH9@P0-b;!h2*zJDlDuy2Wzb4Q#!ZvBe)v2btR}9L7lg zCuBcYrwlZL>Ui@i(|}!||KKreteHHtCD`mPcY6y8qJ5_SkoPWJUvtLn)_cQtyNtw& zLs{@{rNaTQv`#QdGobA_Q4gHqIT1#p+VFfseR6Js5SU&&Bz0v`4{*AZKN`2x!@x;X z9%&9Eh*!Lrw&c4MeApefbE2dJj?0ftun;~%^h0B@5eXIS?FbIXqF%SfRiz(#$gsul$!^+L&X#Be16 z>MkvLSuN=V8#u;Jwl_bArCAL;!F&VoZ*1#NjXZ5I=sQ|rC`p2!mHBq(vZ^43d{FIG zRv(!C>ZoE5u?EqSYk?K2Zg3%DuUS1l24r5Wd3~d82zWhKl3zdC1NEy<;`b;~;boUh zf79D~xEidiH`dz?Ma7*i36cB2bl~S7+spo-s=dqo-Gz1-PP4U+Rc;2;E9$)#zqR0Z zgC(h8Yaj%v2o&W1?FVJ|z4wO6WpHS;a*toc0myLPB_xv{44mAavSATlz}r9nuXjZO zq@FVy*8Fz}E`=?vb>InuwGUnl3Gxm?b?y70e+3U=IHp8yzr${*TF%~bIwTy}niQ@W z)qH`Y#@YU%PI*wua(TT#;TC3m2Kew*#Mc8bAudSg_kId~#mT3*TaSa!&E1?2;~&C+ znw%(gVLuf0?RAYL7K2LB!(_X3V~FKhX=VK&1jhd4CGvI-!I3hfGeqt%uo3ieK_&sy=ktl#7;cZigS@9$gZlEl@XPz2--%JXCv{e&puS z0o#^M1$Bb&;dXFHe#@6mc$(BwTRfx->R%r;>KB}a_FQA@?dz+cd^3C9_Vhm3c5$l~ zJ<10BSZ&Nx*Tw?%=M&lFgl6cwao{w2S{vX}f|m2Sl3`M+_&iMDldpLJ2 zYvUTX4#uT>5tK(UdJE*HHHPD|;X$}{T*mF*^xW)(!J-qFHS#S;zV_G}a8Y+Vo+(dSpS ztpVa|#CDz&P=cKM56IMvBXIYP$#w(w60ni1Di7WI1!C?Geu`F6fV(YC>h;~a@J(Ik z#|ES40J}Ddj2|2T>W3$WT_j^TXyPF3^wbRqW^FbVFB@Pb?<1E{lP>UoR^jnIXC+t~ z_J|a=$U=TX{z)JGM=)4W89L!K2>XdTy!>6F;1W|Dc+h?e{3KNrs%oc!wrjDjweS#B zaJ`h>n<5B?c0237krbfkY5G2CfhS;jFv*9sVi3k<{1%9Kt%ZFnH}5l6T@QmVdlfYO zVu5wNV899fVQB35eCG0MF{pXAQIzf}1Ce18RSFcGt0 zy}61!ga~sBUjOzQ^qf>;-JlOn(6_&XQ~W^L+@9KAC=Nk!G3xCj*`O*|?;s;K2$zI^ zQ{0UAfxMLj_2yO=kgJM}>*4JH4il{)HQy$P5JR7va4zOFJ_7b%iv(9Ir>XzXMgF^U OtE__THoA+s?Ee9sTuZ(H literal 11033 zcma)Cc_38l`yXo}OR_H|ArTSDQa!RW*|KCS4Te#eF-DdmT`AR6(omt2L{}+NiLw?+ z-Lh5Ij6%wit3{jq4(FKhyWc;UKYZrA@AG+|=Y6*KoHOFMQ*1eL(;vEc*zO=Aff}Mo zqY{al-o8PEurOj6gDzo4^bHIPB!@U7^H z(S@e2r)0cC$bQo!i>QPUKXR~l7>z(9GVp42@u_74gDK&N?NNwfzEmP1jL5J=oq@yA z7fm|=(~RuMhY&;v@g@2pTk-X!coTvsB*H9l!C-O-nMMu?^!4^d_KVmM!m`L4QN_Ru z(D_5W$>B6gI4umh9mF7{^7m+T_@1O@tNhJ*)GqP!`AzDOt-^u=~lmz$|nLew@Y zfkK(>biS}~pJ_gW&NIzo&=-XG5vF93$k2JhNCXP9QSJofJDKV?6$uVgR2=Cxv0)>%CmqqQI z)`oZ@Fd!tD7((;*iJ~FS#*2jeESRDrpBZnWI11RcF^YV~DV5mBlyBJX5T9OO)0a#o zY6gb~(E`zZ(Rsssktj^19%&_=GjNvKB=!F_+LVtQTb*$T*&JPPc9u*Bgl8(%5%-J! z|J0uvMG|)^-w{=K0g?=w=_GlQ0%6TM$qE4%5(3QYI9)983o%4sw3I^{3Rizd zq~3$kjk4*P&boiSAKwYsJj26!Ax;|OEnxGL4$`&*_ri$@PNhDmU4CzRCP+;E{aWws zEIt<|&yOJ=n@<@pkc8|7y>ii%Hpr^ao1SrOh-gXhvE4i)Hj_UydS})_<{b`L$5%-A zMw_2q{vIyh=bff}8y{Sf>bZi&=fM_2XK}b$`r{NnKdMOT0Ez1QXYh3!D(HXVHsN86 z5Q~pyG5hZy>?^ixWt@cP(n&Z2A(H5b`KNF0QdxhDv3VSuiGKSpUuuPqrqtC|IE*n& zEvbPa?ri>yy)$u}*$28l6R73K7mEw_!wp|_rfA*79-Bx>mjCC{|MNpE-^k}a_$`O> zbNW@JZOMZ3-v5Kt?z$Cxui=&jYLVXK#um#194x*7<_HlCXW}_yf8Abs^WyQRFe#Kh zJ*59L$UgI!6^p+J;|XFo;}7IAQ&T(+Vj1{KvA!-)bU^JN4`{wfZkL$%{rwIafwHd0@JbkBUjZePUnYu*2Wx=yGPfqag zv-k>_{^c0{w|{2paxLkOsR7>;X#7g#c(+#W)MxSM%D>U>6ffNk_hF+UdeU1T*tcix zE6sWH_X(xmd-=TqRGd-!C&iOA>;{*!_$x5GRWO{@KN~vjj#cp2j&yK0bb$FqRKK&! z#oSg=cKs`1JY@`L`JZVobOEkj+anrO-UOlyR3Loc%Do|e;;i`1wf}>P!XjERwP3m% z<=e#N@0UL%z_NcXz83Mx%}kk2c!9PLoYsKJa4ebKe^+95W7D{qa~v6F{tlp3gi>PL z%2b)4BY^Gyu4Icz)zW$ESJOFa^G&%6?(3t9zNua~<@jp;{bNUBj{wK7R)}okm|mdX z$Vx!wl^9FE8fL#bhO_E7DYyARMK^xvk+9=iP`Hiq1A5IieN)k5@#peil6G|C((Bdm z1`D)UqgJho;sO?bHKtb^!&&}IN@}K6G$fUL%l+B`CMQw-t1>3)eG=IDqk-|z(*=h^ zrO?6hm@E*wxLml*FB;f9MaOaEHTZXFC~)i1N|lzoMNpd z7|yakbB}a5ocaAJb9*oo!s%%LOkPml>bYnui?4_AW(LqIogSnR-kpJ45nSI*c;CSC zVss|$($m9VLfGg3dW>(3;jI2uAm!y(Q6=wrwRzcFDBr>{Epc71%`{e4oz=gvvc?%< z{8{x^xi};lp3>AL_+^>!cMmEc=l><{rS!@fb8!Y3&k(~|`RAY$M1h~t$vR)IyaE0o zRN#t`d9sZNyZ+~j-|gMO22#!KP=+P=@9rIskAJ|i^v}iD=|4VhA;|<=Y=dr=>f0Br zXV?D*%`24aK2SS zT?ZT3`searD)rguRo`x?#Tult(e0;I1w8ZC-z74tbbVJJh-2s9FU!*cmwDOoGr{(c z#}K{CnZ6k9G<;A&x@NKX-T4Oa-Ni9I_&v1j>8@%GR{ok|JSz-m)lcSu^F^SvxkLc3 zRSzZFDBqlBacJF_RV=<4#&F(`w;P7D`)BXF6wv&# zMn*oU9h_gHnW55~8oaTG{rq8v@tiQ6l|LeaVd+q=ulwwy9uxMPp!q8~w1TswnVtXk z7;h_v=qZC;;FvH}9=7%(bb&}NIz|~wPD^lD&hozlChCYG`jpG&?}!`e(9L)SgaCBR zA|+~h$aF+i^vmWI@Yk?hiO-3V-(@}v6am5$@ywdZo zKV3nk<8KajKnu42Q2N7X7gN~%4{PhVZU3vkGC!vn7|S^gdQR0qx<4ww=UX|DdS7JT z`nL>zvo`5^4i;GdzV-7TdK{6he=h$;j$PIjSXB%z*o}_tlUru_i`nsW$L#jNaQ6Mr z8Iu@zV`q5aUQz?ROh)r}^%q%>3$5(?m$~#`@D(6l;$i{?tKWpQe{QPjvHPDVrgtZX zv*L$q%gl#Sa<1Iz)_NfNq5Ao)6uH|R4O#i`h4H*G{BQmEJ<7SUx3UUEdr$#kih#)2 zR&5r4JI33A;eYFgvfooDUbYl6@1O$3oys+*&#h+h=i2|M=M#Ip(QV*Ait>5n`IbJJ zRGxqQ%lUtLnAAclcK^PNDU#zYRc~pQIcF;GSHv3_`W&Zt3%VM#%UPwNCQ9)PSpj%HZ&|Elg|Euyz z!|@~a;2@9k$3BOrJpa_OKyY!ENx3qm8Nxc+K{f>P<`eP8>wy$=7^;LQ1_7-=cOXrOB z>cepU{KkEsE`5R+@~7~oP6j-mK%NQR(_!d?)6Te-I(QR zjl1jG!LM?x^cplk(~Al}6U_n;<013O8Fqlci9ez@H`PJx_ZobQMl*2Iw2m;}-hul= z^2)l($*}pL`_iGY0f=bI&7o)4!eD4v`&+j(@DTgrCAw$`J_O&Ml!`;%(*_>-GbT#{ z7u-0n)%$QTvM#WY7;A*1ZHHf^AL)bP^0Ol~S;t_Zg|Tb*P$raTKdyS^`3??>l{C?M zn&G{3T5(3#1vo$-{dZ~IHJEI?e)w`-86++~VddaI2%H&?N5cQ1!JYGl!2--wi0s<7 znCVymErU5c?j1wGr}ciq{!1))blULZlhVPi(o3Y~LJj1t5I}y7|v}L!Tpvk1k$#=;N8-@VCc(Yv-d@TVEz2k5d*tH z*ekyJitCqhC>3%|Y(9SoE_k)}iD{pJ=*wr<+l!7s)M|mW9ozHaqm?0X?5ln$WrQo4Ynr6s5P@U?=IBjT&S*4?@IRYXRc)fA0Jiy)N zsm{H+3m$V--Yi_^2U}F%{M@$t6DYje;-$YL865h@5BXZ_11|T#%aSY+Xy=>gcBuG( z_?fO@$gG8*9`}9PqbX2NVSFq8cm(Vu^G~l&$N}xiCC5}A^ukwH0SW6}&*54TrSSZM zTR?0+YiP{Khmu2LxA%>G0KsS(E~oZV$bWJ3SH{wGsN>8sH%|?KJwomVUhCVSTkrL6 z)#&?hr>*bzP-!>t4&>&F^xlB8cdK8%IrR~Kq<>x8)l>>({||*5s;Tf{!vk$zn_KWK zy7^skL^=hTX8_uY)PNud?9M zNaP}1Za>6tQ159HY5;D7v}YO8RB-GXkE$9t3K!G&FV~=cgb0~JCBtVA;q6s|pirWBqjeTnI(Rc*caw_baODS-n zjc}Fdz8~D}_U_T1d=(A|B*4|>osRA6m@xj{q@>ET8-A{rsqntz z4V`!0su<~Ea4nCQv?lKz^n+$h%*IyW_()Om(M9ZgTP&PniP66f^+W2#ltRCP zCg?4Y$t-O4fWBWfXYxn(LG2T_4i`ob?9j6Bsj=#WJMtfHq-E60lL{O}8#@-?NuukJy*Cs7(y6I1T8q4Q@@rn?<+v3_^hKI# z>^Fc)V%RmCH67r3G=b}?=Ue#G)SviWG8?X?f4p)}t`oe~I5*M;UO-Z0gql!LBglvL zow6SCf}v%?J~WGy!2Q#;q<`%o81M7mtjt>l5nQAJnA3;?iCZg%|BdAn35WPxb6Kj5xQOn2j^Fsi=@Ai*^F7Rq(hl`$1Kc35#DAXIE5HGYr9hi@sfby$}Yq#rM~0 ztFz`;##vN{5(hv*-6A2#_c_?1p7~r64qpH_-Lp^c<~xutr+PIxHGs|1F_&!vD6u{sxq+`zR+B{oq_nd)e#1!(qwCS3hl^d<098 zF{w|T4NcoWdk0FtR7f5Pb%8avTpco+_5e{jL-pG6LE!i5B6Ym41gp2&Y6@Ii zVWsH3?XCDINc$EVdL;(A-{ENO5DUtN@eK#B#}&vK0> z$`8WEY;ob^PnJTyu3t^k6Fu;~lkX-Y&4k8{S&pk~yCIeTL6yJ#GPtIk>-NWaJ&Z{Y zR<3m^htT*QI!${BLSum85UKzslIH2o(FSnwMpvoKq9S;6ZGp8#+AuuMIM%F}D-F*y zioQ8F=)uIlUl`UN*Wit@r{Zl3 zz#jeYi3dc)Anh<|ccG{)*hbz}bM&|ek%A_94MRhqE&TY7ca0XfZsaced!Z>jRE*dC zu3ro545K>xcYTCzbG3|43pp6p+{jp>qXRqETu(d5R}aS-6ObDH0R)ncN7Aa5;lw%T z8-Kh29_Zc8`}VR84$%+4dm#J{#tii7ghz|vd2#WL1)0j=(l@z{?`{EXEskt-eLoC= z#5KQ!b}WIv4X?_l+O>1uwt^Mf^Hh@DnUmDcafgUI=g}J=grXv>aZqQ*;yO wDhF*SN?EXJ2vQV~$C9Z}xDQ>#8~d+6?>$87)c?04{~f8VsjIn`5$>b;e{a&5rvLx| diff --git a/pdftext/inference.py b/pdftext/inference.py index b5631bc..e4352be 100644 --- a/pdftext/inference.py +++ b/pdftext/inference.py @@ -19,7 +19,7 @@ def update_current(current, new_char): return current -def create_training_row(char_info, prev_char, currblock): +def create_training_row(char_info, prev_char, currblock, currline): char = char_info["char"] char_center_x = (char_info["bbox"][2] + char_info["bbox"][0]) / 2 char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2 @@ -42,10 +42,18 @@ def create_training_row(char_info, prev_char, currblock): "font_match": font_match, "x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0], "y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1], + "line_x_center_gap": char_center_x - currline["center_x"], + "line_y_center_gap": char_center_y - currline["center_y"], + "line_x_gap": char_info["bbox"][0] - currline["bbox"][2], + "line_y_gap": char_info["bbox"][1] - currline["bbox"][3], + "line_x_start_gap": char_info["bbox"][0] - currline["bbox"][0], + "line_y_start_gap": char_info["bbox"][1] - currline["bbox"][1], "block_x_center_gap": char_center_x - currblock["center_x"], "block_y_center_gap": char_center_y - currblock["center_y"], "block_x_gap": char_info["bbox"][0] - currblock["bbox"][2], - "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3] + "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3], + "block_x_start_gap": char_info["bbox"][0] - currblock["bbox"][0], + "block_y_start_gap": char_info["bbox"][1] - currblock["bbox"][1] } return training_row @@ -80,7 +88,7 @@ def infer_single_page(text_chars): span = {"chars": []} for i, char_info in enumerate(text_chars["chars"]): if prev_char: - training_row = create_training_row(char_info, prev_char, block) + training_row = create_training_row(char_info, prev_char, block, line) training_row = [v for _, v in sorted(training_row.items())] prediction = yield training_row @@ -97,6 +105,7 @@ def infer_single_page(text_chars): block = update_block(blocks, block) span["chars"].append(char_info) + line = update_current(line, char_info) block = update_current(block, char_info) prev_char = char_info diff --git a/pyproject.toml b/pyproject.toml index 4b21656..8fd2c7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.1.0" +version = "0.1.1" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"