From 117b12a698e99f8842e69b60445950e53fb10777 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Thu, 29 Nov 2018 23:27:15 +0800
Subject: [PATCH 01/67] * update README.md
 * remove torchvision in requirements.txt

---
 README.md                                   | 47 ++++++++++++++------
 docs/quick_tutorial.md                      |  3 +-
 docs/source/figures/text_classification.png | Bin 54120 -> 73437 bytes
 requirements.txt                            |  1 -
 4 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index be5f78c1..c9c934eb 100644
--- a/README.md
+++ b/README.md
@@ -6,16 +6,39 @@
 ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg)
 [![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest)
 
-fastNLP is a modular Natural Language Processing system based on PyTorch, for fast development of NLP tools. It divides the NLP model based on deep learning into different modules. These modules fall into 4 categories: encoder, interaction, aggregation and decoder, while each category contains different implemented modules. Encoder modules encode the input into some abstract representation, interaction modules make the information in the representation interact with each other, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models could be built on these modules, which vastly simplifies the process of developing NLP models. The architecture of fastNLP is as the figure below:
+FastNLP is a modular Natural Language Processing system based on PyTorch, built for fast development of NLP models.
 
-![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/procedures.PNG)
-![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/text_classification.png)
+A deep learning NLP model is the composition of three types of modules:
+<table>
+<tr>
+    <td><b> module type </b></td>
+    <td><b> functionality </b></td>
+    <td><b> example </b></td>
+</tr>
+<tr>
+    <td> encoder </td>
+    <td> encode the input into some abstract representation </td>
+    <td> embedding, RNN, CNN, transformer </td>
+</tr>
+<tr>
+    <td> aggregator </td>
+    <td> aggregate and reduce information </td>
+    <td> self-attention, max-pooling </td>
+</tr>
+<tr>
+    <td> decoder </td>
+    <td> decode the representation into the output </td>
+    <td> MLP, CRF </td>
+</tr>
+</table>
+
+For example:
+
+![](docs/source/figures/text_classification.png)
 
 ## Requirements
 
 - numpy>=1.14.2
 - torch>=0.4.0
-- torchvision>=0.1.8
 - tensorboardX
 
@@ -39,12 +62,12 @@ pip install fastNLP
 <tr>
     <td><b> fastNLP </b></td>
     <td> an open-source NLP library </td>
 </tr>
 <tr>
-    <td><b> fastNLP.core </b></td>
-    <td> trainer, tester, predictor </td>
+    <td><b> fastNLP.api </b></td>
+    <td> APIs for end-to-end prediction </td>
 </tr>
 <tr>
-    <td><b> fastNLP.loader </b></td>
-    <td> all kinds of loaders/readers </td>
+    <td><b> fastNLP.core </b></td>
+    <td> data representation & train/test procedure </td>
 </tr>
@@ -55,11 +78,7 @@ pip install fastNLP
 <tr>
     <td><b> fastNLP.models </b></td>
    <td> a collection of PyTorch sub-models/components/wheels </td>
 </tr>
 <tr>
-    <td><b> fastNLP.saver </b></td>
-    <td> all kinds of savers/writers </td>
-</tr>
-<tr>
-    <td><b> fastNLP.fastnlp </b></td>
-    <td> a high-level interface for prediction </td>
+    <td><b> fastNLP.io </b></td>
+    <td> readers & savers </td>
 </tr>
diff --git a/docs/quick_tutorial.md b/docs/quick_tutorial.md
index 958ed320..64c51124 100644
--- a/docs/quick_tutorial.md
+++ b/docs/quick_tutorial.md
@@ -1 +1,2 @@
-# FastNLP Quick Tutorial
\ No newline at end of file
+# FastNLP Quick Tutorial
+
diff --git a/docs/source/figures/text_classification.png b/docs/source/figures/text_classification.png
index 5884c64e8bdcaa2722d5a27d9989b976fa949ae0..183aaba9ed8cbd45cdb31f6675a6b304cb8b262a 100644
GIT binary patch
literal 73437
[... base85-encoded binary payload for docs/source/figures/text_classification.png omitted ...]
z=S_U&wzF+G>53F7zpMFr9dNo!(~m`7ae|n}cNxD!JS4>DKf}THEjCAaNA{C+!iS2% zFZm-}XdH_jY>OkY`Git6e?{P~x<0W^F|Xd@@4oF@N`n^$WKUc%9Cvlf%U$E-dg$qI zw^Y1G635&hk*S)XDa=r8qLiiob5?s`@Hi$3KyBz}Yb}1dgE8@Jf3!e(LJG#|;c2An zIsrG#VpW*?w$jrrkquh)eHG~{e!|Q4LiAoF9Je)be8Ufx9^1VUJf~fizrX7-^ z@5cPk0Bs_N*ttxiFX>)8tF}A4mXcg)w3o6JTx`SZ!ja-L@@P3!B<85O~<{YE;WiwlUjoGmZxhOGR|=ZoJ)azK9*Ff zC#mk9xYv!yeOs}d>*iV|AaJ>zP8iX;jv4P%27Ltj*wKKe<_aqU*sCwZk_80)?C>|9 zkFr^>MIA5gKv<9bXNX#PdV;a3+Om@=gLBfX*xZV8?$ zktg(IEI`TkgMr-JED_vDRG&iod42x&h4+GX1Rfj1KalV_*s!X->tgK@ZhFx3nij?P zp5eqit5otpE-~QX%#2i9&YF(^s$SsNU()@*|CUL{8-fKB2Zz@b4q24q7}CY-g0>4V z4~}796ShWX;}g$LjJ;`fpwb!v5+6frIM{GfMXuc&p9wu{WM#sPSzr=n~+4`D!! zk=4!%mw@!aUq|M0-3J*|SuvJX@6Me>`?PEb;v9-8@cljjDTem~ihtH1(EnQfNa;tB z7;INDqnYRiQ~;We`RNyg_6SU6use*fuX3j1A~_8}1LWJ{{2~(p$fr8Y4mvq9yHzi& z(V{oXI^k8HT6F-e%`?B2 zQUKgl4PBXg*&awa?)BUs*IifR^6J-*J%Cpure8$%0xq%XObQximjW>TIrn@m)*B`{ zcrPNygmUXGi58m12f#SL$G=5qjVgsyIs`J9`)ckpql3(O?rN@;REW;L``Uw&;2OogcG@|Vw8R_n>k=xqFQ zW&bjIM&o^M5L)(T=^~s-l|gD)2RgaV@mgH;1gdL1l7lCE^8n~5*dsLca)@ewLGQrI z=Q^l(4eT!NlHSgKshzcUw~VNxX#yf@mUtO4&dAl2VRY5JHcvD?^%rICa54e;Xt|#- zrWveL_vmE(%&9=A1YHO z?bY!b+Q}d)6%I4_i{N*DmZ1CmAE(+b^XnsXO(&ja-<-+iL`<&@G0?mFR8h)m`?B9g z->CH9Euk#B*7rFq;+szE+(_hjb*NXw9kQBD8sAN_NM6EjiuMIrfr~Tx3I8$F8;E(MQI7WE5gz-f`Q> zuVBhPq&8i%U&_vCkrOrNfw@`EBN(Ox{JOwE$qGKuLj7JAGYwC# z2$reaa}it**uv{8Uf9QbR*wH1R^amt{u~Gm{W14^wb`>S-j_z>HnHnUv-)gfkK+^_ z?DJM`tK$>giV^PEledU+Gn>3FGNhWkL!i@$#$Lpk@FN?@G0IBqWBikzQrmBigd`AW zF8WW2uM$u#BYx?%KUHlT2MrA0U9DxpQw`wbrSOM;k0xRJq-wA2FgdnP`mZ{gt6##lho|F9Cs*!; zqwk=b;<-KneR-Q>xqaGMh0}Li{eUt#D`?^6z#WaFf_pjlM_LG{?8H`FIE$x`o!aK$ z#SYzz{Eutlrvs}c3mLCwS4rH(ptwV^*;TtCd!HG0N8COG?)9uLh3PWASmWkleXSJy zz!TO2?|~r?2b!Tj?@iV@IZ0Kyjt6~h?e;EtU(0M@B3>GGRRiCUV%vwmWL6I|OHnte zI3t}8+fqtz#C$N>Jzq7+#;*EERYs*aASo`PrCywkHN>@Gxw51Qj%za8<;GO)7ZzZ~ zruD$}yLW$L#+yx#8ZX5Rf6h18*<)O&EOWCE-Q(N$B0p?zqZ`38i3ZO(F|vN;mvFeF z-xv70Kt`{m0Uj!7>~-9Q?sxBZqsOK|&wY*Aa_>!kd|ho51}AsF@8*q{l4#@nKSkTe zj@4WVkzX8i$D6a`pqTshWwR)>5gEe0{Zt({brdCEvU6!tzv z8PB_1Iq+BfyRQMH_VBtz?z%N!n>Z_w7a1~#88(eV8Cuy3jpx$)pB4&gQrNz20_sBA zu4lYVI%=Hi0`j~`mN)eIh>U53oZJGUxrwyN>%P2v@=2Nq?3RKEOEnp*aBXaaPV^5o z;5gJY)^gRl7z@~K&=b=-IL(L45|L}ob2M)Aw zZ%Ap#V$qeZORI&ZYMFVp>636m{YM69A{23li!pUxy|MSPgKJw1H&Ms=LHprRS_6Cs zAt9zK>?kGR0{r`#G3^YLD3jcj{kvjbmb-3kD&iQcKWn8HvyCWEVbqV&rAK&ohxSO4 z`?I^Dp=WC*x=NW$&$O*3i`)%M<@NW;O2%wgr{~5Pfu@&2lgmh4foZ=J=D1rK2yfJZg`E2qY6vE~vq77` zMK~Q$UB=j!1o|X(P#(~wG4kq|`u!VuVcj=S9Z=vd5}2#7-6gs?TN5!-ZyAbx+|G39Z!seSCcdj{XJ@SI>! 
zF~E`!vc(p9FN++*9d(DbJ$wEtLk-Q&oBX9+%YPpPr+HUGoYb_fC9$F%&1`70MRTa< zrR*eS;E~d`YiNHkUWRK-S6GDYR9;qtckQoyvl)&{Uw`f6aYD|~9I5|K!7Bq45DgWI zGDb_Q8M`>C`Qqb?_vg5XZo~50$f_$X-@*<1-DHZd+f9lUD|VRr+6X`afSP;D_YX}u zv}qwA-#AEUdBG20rEtZKC_NM$y6rcS`A(sav*^i0;aOktjs|aT9}mx(hBU$zjdxsM zR6(8xFWP}NB3K;l4~pQ^Zla?S2y9YlyU-`AIng4~!w7=}Ne(^D_X(=Z$xCM~VA)HuGYizP3l~RHi|{1)E-)DnY_a=;&JNtegf~f&mGG8Hu5=VP{)#V|hRA z6;zWVF+EUFJ6L=C`vT&vY@BIvt@W{d;>u$n!Z!i)R-`WrIyc}R!VUc5c6qu&&@VJK zjqa-qoYe%!a$frQI{g!bYtq6u)x#rIBk7( z>u$R7-n3rxdAmJL!NeAWUdE4we7*C{6Lgo|(UUr@vg91h-+Ir;rI$Cn)!#qtCnlsg z>GY<<|G4nUvC^#tnc{+`M0jB03x-@rglc>iiM)UB@GP6Y4Iz;PbS;;j))1DvkF`(O zX8xcVGqpxaD{t4ocofkhsNVLC+n2;UB~A8K=~tqrrwF`aU)_yk{31N`MU##qzEKFU zDdu)0m9?Dy_Jc`JmG(|E0o=~Fb5xH=f>mpq@vT*8<_VZK8-jb=viP!iJU=xBJKdzn zGs`-6(btIY0ec1{gE|F(2!Hrn1g+~gV`t3EMw=f2JLRoOg9IZ}kNkudjWe{y>+!IaS8Gnx#;L6<(CyNk--|d{@{bXEr7HHx_`To zT=bXJraL8(5mZjiT3g|~QB-S|k6}VO4SnBg@L3E@W}p@;)}H5aP}}lC6%5P7y7vJ` z+X8BSpMez&|GwtChfRsthtV(iMz;66pDe2{$4hhH}_bg^P2F`S^HDv z1>T!x6Qoj(Vie0(UO;i%+iH9=8LjroEuAxlrn6y^-=&PNe(2mt;0j5`m{$#T%6=abD7}tprbV>N4xWmrP0v z+KzGE=VJLAJhQczsEOnnxwG04{m009vyc*~jfF&`(CRcxNG)NUvjp4cAx4u2x50781@I=E63@+Ez&hh@Yy3zmy@&qbbhx0{N~c zppa$nl8=`p!!1-p$jZSYL*1uz{`0jSF#by|>f9T9dxLHTo;pR3_iLYfL0FY@eP`{2 ziTlOGah4;eUnG7G$tJ4}3_W^-zX9)(7lM>e-c?T(nb^yi>NZvAf!k%-~h%>y=m3 zkk+`1E6Z4Zdgr6<`HuVc?p8Xr_1s~?1C^07r$J9))$x)D`Bs#C}Nyk~B=g{Hgx*=7q zO+m|Pp}Yfo4j!1Bwq$Xl84_gCS=FJ#{-DHZsAso0_IBW-O2tEWs4 zknS6qfojP=i{xc9=S-8gip~|HbTbsCIKz|(H8;5OaPDfkd`j%(1EVsyIDQjbXKO|J z9L|_tZrp;BoKTc($c#W-s4w@E)9Gjb%=yLz5f`<90;M0yyy9jZEdwRgRD( zTlBjZ;LH23{XDa;;9223Z-x_pNWrc)Pp=rYG^pL1zAYV>KKDvkzcN0%+NIA%)rGyw zQ;1cqR~27W{OfhZQb%&Hysg2mar{Js2g4m7)k%9VCmx-RwW<-j5bTgMo++P03vwW4 zXv>CIMx6+qH!)iJQcE-6diW zjS*)_`?KVZ&$%(Z@?>{IMo901`jnWzfJGmU=IA__ab+dF_aTzx9m>d@G)G%C^NL5m za`5}ClRwfV;JL~dZSzd&$=LK~0*xNK551yu{ZnZ6O|E8drZ}Pb1TuB%nIWOKU~j}8 zyvFE|2QvMu1t>ZrX!wjS>0q#_3>frP-F0IkpRS(&`8+v1Pjb z`Nx95`&*zN+2Qx_)6WU@g=ihLoM>+!xGt*_snuu1RW)PRkN5ODjJgttv)E%epONKd zOg}iF!b}`!vghy(1O_@wbA@7_)|bYZcD z_%Yk2AH^i<0yI*)4HhdOe0s~m0d*NZ9+h_qWqMAmb#n?8d~N44B_kbPlfM$PTPHgqTacS(a~bjR z>}D6;Mz$>eCrNAytexB^?#s|}0m7EhR2fyw5xuK!BKrd@B1#jaJcJgy3+hR@)4k?Xxq~iyN*@$!` zUijsr*7hak{irVH8GnqrrkkAkhGEEWS3`1>D|@W{*R!lBH3|eCNZZ;+UZj1AfMXiV?XiTs$UGn=mo9^J|-?Ld7oyE&jvNczT|?p9m$LlTC7>CE1E`%a;?{J1+xdCa74K>q--`;%y*(;jjNFMYg09E}Lc6w))@OB} zhkI~W`(ezP4P|nZ0OLss(Qwcx`86>mrEh9IU|?M>yhzZJQ-62g)w?>Kz@O&e%I7F% zWu)xM0DTHUjs&^b=aw2qo8}9r?tBUIK+`vo1X0q^JAIFbb3(TQN1gRe`}R5Haw8cX zC+#*iV7Gn*?)X8Dx`tiy7v+xZwC}Imv>VTtacjKmkF##LD+5sD91lA2TYfsdWe(?X zjifzyaYN{BzYWCxC7ou86d-x?s+=Je6_x*%uSI|`F<5B#>-3Am=@thXJKql@mSTA| ze*ya>dq0*$j#AII{{9d12Rit5ZgO=_er6f#6`GB$X$~A7TJJapzIMRuN9g7^2SuEc zs-=Wmn~>H)A_u|8RP?pu#57Acp^GHGw)0+l6Idm>^ZWRDr&)OL$D6sf*GFX>H!O1V zcp7#7{;YSUii9v(DtDO8q`YwAa*wpxru!TIzdN&-s{Fj3L&6GdyT7?C)uVw(#4{xl zXOE_!^w)^j&16cv^i35Tt0l^?eO&e{A-?o(EQGRoX2xGCT=DrBRoUdCzj#1_jGl|sjp0vyyCz% zOgM_PHs^tH8IPut_{na)G6&p?U6e3w__K;p4NAR)NRrY%DkPzPGVkWrJ0dmPZ_my= zdqds128Ro}*AW?cCt$C#7M?^r=beGro!#PI&q@&0lxcyxGoN}*1Ou;|iT8aXr$5Ey z5c{81?&IJ-rW>Hao=op#!}5#I`uuL;A`rA*wZkU2l{Xg`?mJg^qTo%r$%&u#kUKI! z@8YvE@!;zhxwG)2(&yFg#1EYdAAv&c82j$DOlQko${TEPFN;d5^rAHal%3{z@JGqT z53Znbf93J3b67Bk1#Nq@6?$ePJYC3P5d2UQs`_Qy*7C7i9FN+iks? 
zs7I@=?5~^fc7NErb%0Vy7fA3GXn!^}4Y%x{sN$Y=`(lh`Wu>#=zWTk66|DYT{)gXT zYJbdn@Z4%8_jP%4mxmN@#I6%>@i-K5e6%q-ceph-b+|D#c6X}0g<8x77%Io`+lpsw zp;bXQU*eWlPavix6-$;}IoGK>Kio!lWOZ+Cfvsk+@&1pD0WNw=pPfmdwG~qE-ih5-Vh#o3t4)vzsU-G8FF(A7XQWrKCKV6<6foDKjLfHeCb1VO zN5AY7(X4dl%`bNRx4RRG2RwG(x6|oN*mOA>TgwTS@TG1bx(vqURgSFZ;=$IcXz)rQ z)Mvo9n^c&}$2oj`GSeqXZXR~4jW5p`<;CwhE)H;IN(0`zKBjWD-u-lTI>~A7v;&UY zLdj{Vu-L|?#@uY%5BWj19~!egePu`wFc>UfBW&9yQqt>VlDO|jlIS?YdK&S;dTTZP z_^_>q-h6jW{(R=tkk6nlzc9U_8x)(62<#@*BT^xWtLJat3z0!_IKbb3^DqR&w8P=XXVgH$C!2X+7U^JyCz|Eo-J#&mm%T73S!XO z_m$tlKtn3(caV2thx|REtK*ywW!&d2D%$-!Nq?o(ItU66o)@2X&JHChq=JR#3%xt; z?pClI?xk*yCtYrjPo+zteNVA|CcHc!e$Eb@E}HXSV)Ie@CNE~nes0rl^b1)xqd@)$ zSLQEhmSDdd6>gngwC_0tD#v~tb0&yeuuggzRWik9hd;N9w8bk`u6a46`FY|$k(s|) z?D3zK)$!My}OCPhga+gZ|L``XUOJd&E{+ ztoWs>>F+9#4l_+-;`M(RS3)51$q84L?M=8gf`c6#EovhQ_CfM+vfdY$o4P*bs*BF| z@HmL;IOd;-JfT8o`kb2ITb3q!c5}zmt@3nTg*2sO8VU8C^87kgY`m#CEC0O7Hv*|V zx$b_j**GV#xSQ(K?7C99@*|tmgWxtuT3m+tOC-yP#xoF0g9#+)=e`_i1el>*p*nfH zrRqcK>F)^xKX5@*$gvVBetYcKJn2Nmvh&6L1ZzCFY#R@Lwp;t1qvc}RKT3yQ4Ntj( z1`JNw$aDcWp{Os@flVgS2hV@g;N4vNiknnQvy#%~TD3m6j!bf1bZ}iktPbFBrzz5b z4o3O7w`ztz_~7^;=Ub9Enq}!AKo>)cVYRtbGz?wKUzuufTF58;Vl%t%`!| zumIscrhu)iKo-BMx!`-hZwKbuoW*%0tuma1)tCJ?JMe>fA(#4fFOxNy2!)+lpp*X6rPW;2?p>Ynzg6@eZaZTw~rI`H}Z7(utY#K)! z$}~)YtRqMXVySZf+-@^x(khePB9`yX&xVqOyz9Ju<5lp6ELlnW&s$e%QwOL+SlpG6 zsaFSjB@MSa^*r_^J1xM5m8#eO`tjT+<*NKnDG=!yJo}Tg7%bOMMZPGUXmLV0iC);!ZljT}T>tdbegM+3(Q$=6 zZT^Hws4i{f9G$`FxU>~tP;`pvVlggM&mFgbq=X*ncYSXo0s0tjHKtpeQF zCx^v7wLwbEpOS|oX~$Sc_$WSVfRtlgxFA4;M_3k<;V;Z2v5#$mlPne1PKiFU|5+jN z`bzTI4%iS5byyn@lv5^2H*P>yy|5B*GFAppOf8v!sYp+zqMN~c;OE%bRM`AL59KV0k2h|H|$j)WDH;JpVPx2`L7fSHCwAl_Nv|DsbnJ`ThbXPYI+yeQRp^Ieknq zUN&#UJ~ zds-gh^U{-`er1cg^|{wDWvEP$JX6a~`l0^8{={1q?}znEiez!{Wjs?3+PnR0UtFSd zm)RaDL+srI%>3TX^$k1l&aPIq(&3k$PnkL^BDC-dr4!GX>(eZ~Or&s&_AAMlgz4il zxdnRI4ufZsG_x)J81>t@Q}@=^vXu!mX$ajpu%F-DG)kna5yrsr-jKmpxwJ%*(zkaK z1=0iR|GdJ&UNG6vfd0Wij%s-davIL{M~jH2W7WgABeIao24I8tJeA0d`91MN+gYa6 z?^Th5G?`GC?-=W#hz)3vu`iIPHOwo;TxK+zIWx+b@X%czyT@6fm|1aRCsJ-C_B`+C#_O194++sfJ0`c7{BU{VlE~hK6MNf|H@4lNa_la;8eF=w>4MtMJFu#K$`I34g}oqwIe^IItiH}k zCrp%tbf>IGNmzNAc!brjf9hQMmc8bo9?mKA6V6vA@S9sGDFUznIoxFl_efRT*aB2% zx2jc@;lX!>tZr|m$1(8^vlo+w)(7!W7B>vNnSN*;5aFGWNKv$^suhsx8#CY>6n9k)*3Sm5}>k1El zwwS=@xfb_zb9C7d1E4Leg>Pb7#(dn3882Z!Q+Vn$B)@oQd;(?v-(!bm>MtP+q^OmMS(9@h%2)7@3I9$$1yaeYfoM^h83B*PUa^m$~n zz$CGsK+|&x@Zhf)%fG!jdZ=VKWyqL3FMjx$$9Okv&^0?g2P1-4Qya}^7o=6k z&St#>3AX8Dqe+Fpjf_G>f%4CESpqK`0O^75pPqbAa+CROAA(U3p(q{TRDoWD^psBbcNw?ebWF>es6D7hD82q;^uiKvC z!^vk{HVC)u5yk%XXsSyYKFf-;w&P{cI<5F!B>RP`=?9+H5EH? 
z@tg@-Ok29FbSc<(e1)15k}y8pZVWag)kkBENht#857q-5_Kf}I&ReHs0AS!aOWq(} zQGBd4)@?a4xUE$69Z*Cj%pdUZ*t(~*EU8<4?R%VZvG!o47eW;W{V*mc1KD2H#=c+u zUJHO`%l964qGUt(*vD}hckdVYDhv{YIrMumihf-D$wyIPH)~yOnBNYW)==xm;FOr7 zQ_{UGyM3IHjvIs|Q0Mu)XkXJlrUz8yoIk7sOXux;k6Jgew4-%8NTH#by+g$0&;j~J znwN2bjDVfesC&8AOhfSK`;}B}>-n9r7&CPRvEAtjhnX6b(T9Wi`{Ipl;wQ)!$`3z& zFv=2Ztq(-mm6a-yJK|oH^=!&l!Y-`6XY;)Q-8bXmEl&Kn*P(%C)qd`9YQM8uO$K!# zw;e>VM16-}>hqkeDa}6FkQiz;0Oz@{dPLfE*+__R2d-%Sm3})=`%Mu02fO!2_?Y#{ zp{ew^PAgo%1%I*qrqWQ-=bKR#$d(NpX;p1KuIVSKvetd-?)AkpjFJ(dYBGf3Z13w= z#&Z84sROxr$Bl67jklUbmoEw!6ef^$p{3r7S8-^a?56SN1CQ#}xekYlme?2<{IS?6 z3g$-TDC3Gl39-z`s+Nm==7a6`-BsMX=W}{?DP|a0Aq7+@1T2G=!>atMo97-q zT#lv~E>`lcE-L)8wn%pg^c=JzqO_nob{MbvPf9pLF2O;dGRfp3KohoxA^bhz2eXV>lfH z*4*hWx~}Nt>Fc)A2IGe9HNq86TP+h_e8h(6?&u6>NhwW=Xfk+Kl-bvJCgdv(*XGMV zH{!Y)sOI--=P?^X_;pP(W$98alVGw`yh=OEa%nw7+_|`blXgxC2mZ`Y-8Z=rWq0)g zD$H|xM1#Pqsz*Ymln@Lx(4$I?@X)QcnZlLsXtJ|m%rL3%SB?iHgfAw)3wj1#w9qX2 zonol+#n@t>0vtp!iKHgMogXi4Y5WTmlkWi69z$;)s_Fl-5jXEp4onoK*i1@ zrNLD3$#`DX#xf}6K-1GAwuIy8ho*iUhCrac*>%V8EBMt=<5pgMWmj$pK!OX3|8anI z4w~!R`@%Mj^HyaRx8MgC{pIjl&mrtui^{}VOCuSmpA^U;ErD!ee0&_Qhas?RtoW%+ zg;@ve`Ja|5*p!CU0AGqX~os30&kdoRh8t!)LMeD#pIpeNx z${Kb0f+=^Eoqs>TbC=u)DzP)d8WT5Fcs`om>jGrFA@TTHK@|Y9a2{h0yytlqyYGK; z-MFW>VeNNV6N!16hTV4OX2t)s-V}QHIcggzPl|>nM6P@)oh6o3y}>1jF!5`n4md6s zgM34tu-gv^c1|!Wf|M<$;QgvF2Za%!OXpi70Er#|<$LL%(FT=UbZ&?Xu(lGaJ{{+f zoS#o|Kmoyd_TiYu*()(5(2Z)g>V0=Fh-LRhaV4I)W#FSBAoqRG+KF|D#vJ1VAbBDm zFvg*eO^0l&AUl#zMMVX>;L*#RkT_PXi@~`?#79b+e@UwLJgx~IaR4AP>#fWnGy?(X zr@+y=Mi7ooUmqORQa3GB#v^AH^`N_EP*#UPA1LiYo6AzdITI=4SPqE%*@yGKl{sgh zO&%A59>#bxOyty~D|0 zpbBFmmA%7dVnDRs0r`^{cjz>B0vG`tj=ePy_#>hWqSwkjft= z*aDrNFLB&Z+nboMzjVae`>htsV=6(Qs#h_;%maAwQ{qeXbZgq0yaMo3Q%Bx1hZ&?* zskawcDz(+T^l39ifK)r&w=3fRKPtAUX2D82-T;t!N# z%!c;_S$(}eitN?hRH_oE0e}MA`Z;}~kM*sw$4pAO(V`fOOzD`k{8pWxA@D$}PgQMN zRkL)%ZP9mE}_ zmrUAkz`b@_T9WqRZxbAn0Kgxh?#AQzipXzk#(z0x+RXGU4vl{)`NGL6=NYd&pvlNr z6H4q81h^Q#a6D7eK@(g4$;6)Q@z*F>=j9HF-`=}xSigWb(3HJ?X(nc1`0Fdwu}~Dd zy=_i>>BQt!>)S*no|6w#U_y|ZD3NmEl6n9ykJbAKhVdtdp2Y!tc9&Dp==7y!rp zh-lE0c4^Zg-m5fhXfg`)kSqux2;~RBh1{5n3uDp?CBH3ZGsBAdsnR|9 z`6xG!;M@Q!lTM=ox(g~AsE;8`3AhkV;Q_x(zi9Dtv0W%BDdFuK(D+j_Y!O)Sl(S@J zsPJ4(^nV+Cu5#{I&1UDo`uZJd*LC?P z1GJdwG|F--Bg8t{LD+WxcbBe8X0Cw4thOhh*aSQ?zbo84s#Ew1rJ5#B%9Vx>+fO{1 zVkRnM?@*QzkecY0Z4<;_0Ka!HtzV1p6nZDePW7kKJc(^u4l;u*IlXmX8MvFFKs?Zt zI>Y+(cwT($@M`PU!PrrEqP_6l=R6ME4BjGMgK)!p75$T&lW#Fr@kMuh$*v8@>YAsj zzev{yn~`T|pwDvx*i{CI1$!oPF1}<+4EW9NSCRM^ir;VT4h8Cro|$}kLT?=%Q10vN z17Nf8^)CPjgz8LENqT=t67kMFU4Jr^1ZQ;Ml}}t=@&7!Q2bm>6SI^2-8_q`N{fJPZ zM>S$ZCpTABoDP1(OIlADiza6SLp^rVdHW8x25S$1yNTz;ZJk`(hQL*A6@Rh3Aj@;< z0!Urea~cl&pSO&W%}p_GaSrQBq*j$phtq_49<}#wwfZMga_ieO_&LKRLawb7pvG~5 zf0N_jd9{ZTst-^8)dI*=gscDAFwBocBwUyJA_oacp2Sy_S@Y=3Z`^TjxQ)TWV> z7c~9L;no;x>ncP7MJ@lKeG4i7K1o@mE=0XBLIR@Dn{bC5z5Dxkl;Ndzq0<_0!QfOP zme(%uUKQBW6$u6J^1;uOo+3>2x(WEr)!ygnk@pd3*2jCCa!j4|eN;z0b=~?hJMd&k z5V0=!UE)}z`Dy=+s+UP@GBJ-02}Cv*`Q8KrHYIjVI9>aIx`w>%QHEPZ=zy3qA!5fX z*mMU-%`AC1^&$peC_5l;E$;bt!)xn>$!3K5C7_R}fmj|AG|gP(_6(l^*)QrkORq1{ zz@5dY(m7TC&>|?rjghsK-VkPTYL5&f*Q10=f%<|x@Awz20r#wbdF=xQJpYI|aH6MQ z72Y8d=EjtiqoIs>&3fhU-*M1K--7mnKl?p@>kVDTS+rsSlMnY zAQ(Z*UxDK{NRT3Rm{|j$`My0&7HTvQXV^6Z$adL_8?hopga&vZ=E*=xIdC3is|9~?6xOr zhE4jW_ohsjm+>Mu@K^FLO>m@>ogZqPm3mETeO`$j>~C%kc*X?r>m5#82HoKD9vmxQ#EkzSvrjHV|OSAn0v3I-^N|dSpu7m$aY<=_`$f@8@*oX%n(UyEjGdXJ( zd4H18iX)HJDjf6kb(czE&*PUx#Ce6+Gsw+fEHL3}NjRo7SQ1quTj%Ld68mNA?N;s^ zY8Y+ZEm95*nEo#e@!vHK4t$awe8ihhJ%$1QxG2F;kjbnU=4_0*O3rS;QUZkiaz#7` zBEhDsstgY0Jk6(0Atp+Gi(!uX4tXCOodD@O^|Ld~=MkS+Sv*Efme4@7sG)tVq|d)D 
z3*?^_JCY$C*7)W4s*{g_EIm+zNESctCC3Y)0uw6Q5|^@M(E-(F?ay&ba3C1#gH%47$TAwwn8V|)XpctH0sy}l&ianUnjxYR z&%9pemhj=XlK<_F zfp+HKNd`pzg#ooyI$8wOT~@)i%Dx-KatcrFQdP+^`FhgR#3K3LFfqsG-S`s91cpUknKDWE}oa2=SkN_up58jX7s~O{D{T{sw+t+AH-c6@f>j#2@QdPKg{T4C4XB znxo#bQ#567A$JwFqcy()%g`wRVBdf~rbX*4UUmXILV$DmU#qD2Hi?<&Md5sWUC=Z9 zWH;~cX|FbDVSCiLDd|LFE!)vDjegr0!cLA^jXzv$w93ohEQ2yB+3U`KoyWlpztttN z{g>H*EWao|0_se2B^o5d_gX#|%0wWL_MK=jZH))d*TlfAzH}N#20GCMNp0E#m1jWP zbEYBklIDGc?qe%WJhqm9h?svJR8Ms9N+}=i5L178CE7Fk(Xfs4=$GQl8ztaI7;lTu z8c7N@C1i9*5phU|Y{oMK6IH3-4>8CNSG2PKoT@EI^sh_$M?LZJ3821O1E$61=6{oS zs!#$n5$}qro2A5_aR~l2fy1&z-p@NCLOkf^zHz0#*6`^)7Z6f>tj<{@A#u5|p##y5 zNC|1r0j3i7{nvwX8D90(*=ZAr_~lCAj7F`B9F!hl8GU`eAh#D%uHtRzh(rj`OSgi)RDapw_h~?)lQf?KOd&+}DiaP2GO}UrbkF0MZDc zvOK*}pI7*0DHnOBMwjO2Zz?C5Zdm1kZbU;2j^WLX+jpdeu08Mn@tL{dJSr7P*lulp z(u+K&Eh~1WTu@@5&gGp(nng~Vqt+&mQ0{IMfTMx~##r;djL%%a5r5CWY4of#3{lAB z%&!id4|ovp+()=tuYZkjRDFE>EDA9#cM!=k)A4z0VgbPha1_1SfhIB$jTAK+Ov?IP z>Ud_pGE=7VcQ;iPp1LSuV$ar7z59L*KT!ICk~Px+rr9oE%XpY>#wG#E(#7k#;z4&; zKb~uhd)(VIS`Y7f>B%FH7He9c?%#a<SeHK(&}0ro&%z;TeE|-fJie-e9!o{$FlJ@l{(fGv_#bjS#jvQ#){tpG=&uRQB{cqjrJ@ji}_PEk5Cy0ng>A z{+T!rMrNK0Sp|(y-Sx~mGHjazQYG#N6#!-Y_pAR7i~gf(>^Xy=b(X2*_nfV6`;)$3 z6P&xhQwSMa0fF)BP18ulCT>X!DL@%W01s7<9!w$0Be*1*yWyGXVph7U-p5p(7J|1# zoofrqdJGJlXH(4e?BjEx+Ph4@Ux)Anm^X~xoaehaMX|j|ngM_mgv4vf>CS)>vlW{8 zdH^SBovaCzIx?aWWB7ww;KFY`F9HOZiffDupNgpR(nyQ`U)+yfu;PlqBR48%DbCwj z3|emn4T%atv3Le$3ci&=&yF}p0c~8I0BFUgs@$24Y=sH%!W zS1$?m`-U||vRF2D_Mq-NOKsMsErZt6?qQ))@5NFBMh)=_S1#@LNwxS>O&6A)Uu;o< z)c?ndU(3a`?&tg>$$bgU))*amM<5J`2!bD2O_228BM2Eg64AC7OLPc8|n0Vv+vq?~(O1%NG7 z%F~Qd(HCbal~0a^ymv>kvlcaZ!ZaX3^e@&ozHyUI!GB?$@+Cf6mw4=r3A^#i`y}}a zIiE~;;=|(sJu813`7@0xt^|se+@NPvfafxs1k?ntnGL5WILTnxx0H#SE#9U!h1XP( z5C%Bzb0fRYn5rY7_m(MF`Q5V7mV8{y<3FvPQF=If0CTThhHqekSTcw@Sw zDsEJ{xIW0%NO2{ZZ(o_6bdArE_^6{phvC|Oq?M^^OPXPK=zW6JsVzKLm*G)q7$a7e z$VzQqEznYHZ*HMdx}h>?p9waG8BN*g%M) zr{Cd6PfI~0(Z2N}b-#@dc(%ur2Hg$+zxLh(sHv{|8;u~KAYegIkzxg;2?$6B5tSk+ zy+~0&Iw(~LAqWDGBA`g`9jT!PsS#;XgR}r4pdvLSk&*-wlH7yO`+o0pzx%&!=FXk( z&i&57%o%1#_St)%wb%Onerv6bVg@NUAQPWCwZ`HH?xPL`sfTn|QWp*6==MY1 zT^E8r59pDTIFw`#IF&%#r$2P@iNk~8U_o2YUMM6wc+rSE!#*&qJGA~suw0uu-KudX z%O~)WXV~PD^s-P8ty9WmkQL+}P<+aJ$9$S$RMZpID3nT|J6%u`KH7^I5imm6439T6I5Whl%n)I_K%`d zgE58))2L6s>ok%=_YRUAoDRYc&g0E?w_C}p(?7$KUELzW@M4iB=N>;l2n(2l{|JLj zBA*kLOrHJ7p5zyMeS7(%NUl_|MC4G&RrYr-e|xf)dORTTW_r(ZA8sW>Fo1uMq-+hh zTP$XgPQ7v*FoH%O2!+vvr}agEM7oY|dTa@si8SR4vU)sT*peM^!QH>bE~8W!%}-!^*b%5qoOHYC z*w{TFq|yv!2Y?_NAa!{903jvGWZOb0u;E36(!{^%QH3ik`=`uIV$z#Xp4a8@c2hqC zou@@J@1OM<2xYFjg)P?$oLs0*0?^=pd%@ZyAHa^AKXx~TA~*J)hO`c_y(>{v^};tp z?fNs8#(@aqrwMPGp0JtJ&F2OS%jo260AG!`J1?j+`RrYBFKykgMEz;Jo+@=`)d46G z`;@?3r-}VI0=JH@*XsSxf&afJ<_QZK?+TOv+({Gj2}90+68*mDWdj}{a54V)9R&m~ z#)6MS2Ru!L4S>R!nrTd&3~K5UJSdGED9pKjzcj#q;8nX=}$6Oz(oXyLs;uFO|g|0=Xv-2qZ_&Sb-++ZO|@6*c7QcAVeo%80S*iD0e9|A9)fyFIF(-~WkU z8A|1+o9}lzm&nZd6E3Qhz=7@l&Yo;UsSMu3mX@f!oqK5YJscARS8Fgmx%NkVBaQ@? 
zOx&;x}CV2&KrG-h3bh8WFkBwwVMgPa2H$OC(0#oong}FwfU`N}asrIT)fjMFKE6{+s z#rltM+h9;j8R+{fgWkHTSL2*gwIq?Dt7L9EblZ`n+Rw$#uEnzz{%Jov?3 z&K8|gJQejX989PVTEPcQMw%Qn{VXP529R^;^JNA?NUi~`tyQ)Q^eTJ>c5Kqq>IUNe zqH+OUQUzWFL|?yb$ngRj0R*gfZK^!K0_#(-Ev@Vl4kEfi7qQ5Y7xGjP2y%B}cHqmc zwJ$nWuJGicT3meV!B8#LvxG&rbQkCjNezL3vvM?*Cao)=s{wNrJl_ ziuHgtX-UE;kQl-df5{gGX5pTtMn(-No8?CJX=9S*#IaiDAw3ni-g6fY=4){Q(#?xL zh^D<26}#Zcje3tk$fPsS!s1Yk1KD9CK<06OMY#MGZJqEx3YkPI_Gd*~!a`;|vnMko(lEE0IGzSd8-i;UjyBBnHcRmI{VWUT> zw$}$)nu;Dzd@FIoP?rlm==GBjB2~zVfToc7tR?Qgtf#!xzL=oz=pIVNd`<8XCw3^| z!B8*n!xVrOdHGOaq?BfUlhT#R!_TIO?!Rew`|tsoZz!7Hd;{o1pjUJ+4P zrJgJiY4GOHsfWxc`3HHVxtKvLacTW+e1WB&X97s=C=;rI&5%aoH)L$V1QQ_X2w8EE zf|q&{Ht~TOfX|pcUh$?GU`SYKZbO2S z@3ePqP9`w&vBWAJK#*OZ8`jscaFE<_R^}OMnVJFjlAQcqKD%3(aQ%-?WjpWr@)z-^ zX%GtGF*A^|_}d1Ij+1kl>D#Su-+`*}cAg8Iok=;5EjpFV^Jn)SI5&V)#k>mb87Thl z=v8+2U@)}ou(2~x1-r>kkr)}xI=|w76KA6U;^jHd;V9QRC6fDBLt`q-p`%4n0We$= zXW%rxeRz$mmPL9{{>4xrNCtoiB?Cbc-1L@JTR$Qpb+$4qH`MQwBr~Q>rp?|`GWbA6 z^bhIKp0tEFeFn3_{o+7n+Afc1e|9oUeR<#t5v?i3fh zJI;S&WdnH)(m?x6x4PO)EM76_dUnC2ANZ z6ncrYt*3M7XtvmW7hP?y{(+X5U3Ep*aIY+ahU2e6dyUD^pL|UvavzV`Fy`5}zxoBB zrGJR%irv;iH-vtIY_MNzNRD{iB6@T4OMTWoEpxV=F^@+WN@Y5jMBCk{&`@S%F0Zjj zXRO%U8Bf)k)sroxWbSgQ$lxA>J_ z_uO=*SaWPq5hC-=57rhOt3>v=oAToJHy~PY-$rG29N*jYfz3PoONXLVG%}|6WVCuY zh5;43YiTo*q1u_ES1zMor;{%F{Ho3YAlo_PupP6pu&x7aqd&?G`M?`sI0tt`Fg2n~ zVoYeF4${fGf!?ootnXswY_xKCF7J(X2q8`@k+#nh>+ZE^i|j@LYNz~JQ>s>q;%*c? zomNrdm3!UESR-?+at5kx~lKy7%7Y=5;n}efY|~zX;Wuy%Sdb5u#wtWXlq@U zsd6(QjC9}Jl|s$g-MBtAJ@IX4eM+oX#Z86j_&pA+ovM~g=#C5YG+5Q0zY{iQ;#2d! zD+W+=tc-1!OrvcX*Fhh3fG{l9Mk{h7Hj@^+A$-LAkTwa^B@x=uq%^|gZB4*&l$^W= zyI&9W38d4FHjwjS4*XE62I=QVk+g^>Sm~REtDkZd+kv=@?RF@imtu$|H~0eRrJf^3 zTV5YH>b(K?dSrd`-tUSX675%?bJ?|+wv`egbiH279m+?kzrmL&f17Kr@9tX>v}2xS z3Dhz5#?PTuJF0xX^)-JtA3ZB*+gyYYvb0gHZG@c#cZ;0V(7<=!J9H%CIABu*%hS2) z#(2niwp!Pj96sti7tziD;N9?nH`zx6drn)VZ6h@n-tsGs4CSb`o_8F`ZYQrkZo!Rx zYy15$NUkmH{^}yn9OXNn1G5&%{t{~01$QFfw;O40yf5>nTG%o{CuUR?!{xqD$CxQO%s>hE-l-6t;mHuvOM&GO93%cXFsc5>$pof zs*OvVjd*^VyYjCSEBB8| z0ie4lI`Zd3SB4yieKJXk6`9JW`$S>}0!^P?*jxX0;KObJOB|ahnWqXMh5*l7>Gltw zJ}Udg&T+q=n}tJH(ARRAfH+Hg_;s$}V?$N^=R}EwgF)KQvRNg09{TnVa8SQ9NjgEk zJ(^SA-ip!xroF_jVueo46tk!L_ows&GXipmsdOm-1*Zf~-3VXTIxcY~H4B`0R@wLC z-u$wRvt--^ki9?^+CA#G#*!YvOV@66kI}w{guEC{D-IOC61n5 z6k`+SFiC2jQJj=M=1&>tH93qvzNGP)1s8mqHDx~EA$yyk=y}qN#-EBOai$n zkqGCv5f%%|0S)aEHXXA=>4>2ro86%Lb)bxxz50zfY!m*)qm_{H3LCcZdJDyRh6w3~ z%lXZtK>nQCHRUPa`)_-5>Wd@>S=iUezrk+YhmIrxCw(*5EO~Baw@!lZp8gyz>2LVq zxI`DvMBU3Oq(CC+0aNngXRl7&g4CYu%O{qE@PQ|p+Jepph*Ui=D~C+E4^q9uAMD5+ z0y){voiZ^!SH-dwlji}%chTKxXAWuZ6+_qKXm3SR%`o%=He zpxE@gf1K?H3D4mVfK3mollu3!{7YSO@Bet;{a+gKe|We5!3R9Bc|7vpT7Z8~_y1=G z|3Bh@D-8~EV@xb8TzV6Pa!C)KfH!LUhIZD*W7Vs9MebNpTHTMGi>2E=T&vYu3>qAx zYAZAcNNgpA$aKb*nsw{km`HPwg{ zCd0Fh9)l#xL)X@+4pvaz<%5^9?vd7qB}cI!gEFLp0Qjk{9c45-aMq1vDdLr^+5tF8 zS68xwa@^eAW$Rsl(q*EW53P+FJKWoK;BF_Y5y^}?hfXFrpv?jGjDW;0VpwcRU3cM( zfZvde!tavEM4-!CQ;+5HNVbQe(h?8V^8ReQU<+8`I`|0*--z3RBR}Fyls0QMN(ZoO=B<@`61a zHKrJ?eCC^?LdI^+voA*_@ux2vfgF&zwbEQ~ zSimzgE~!hU56&@cgD}o8BjvTCy0?dx_}~Q78J~PeK2)e^g%2FR2`bEpNQs42y53c~5s};5YxGX7A<_=I`d`I8d^&LAg8iXogkh zBr*YTWK5X}3wBtC(dW0*K4H@fQJd9{cuR4^HM0js*9n{PQ0l}P4gn;Kn%&^H#fnCR zbMt{4Q&s-Mav<|@GtdgqwD%n^4pAUD$b%hTLo5sv=W65Qwc5xcueI*g3ch|!3oBGd zT+VpFn`$2x{2{Cz=;z|`yw!W$c_2xQ6mpQ9B-0W|#)M$2UJfcRC7EQj1QF)j=H6er z3%ATc%{BXF5;#u!GotwHrrQkKc`;`ZiolbJzS;8cO)IkG9Q6IuOECT_r*!3DnrFW*+j3ZkpkKt|LC^ESwV?rU=hN4&BVf z^vs(y>VnZD1)_ak^hGCK02d^`=<#BnUwS3*;qIOUr%IS~;Qk6knocG_j9kUAXp@ZG z;;>@%6upqQOYkVs?fXsp62n5(yFK;;a8JUf(P-$kRm+m;+6G3L+CnSR5B7ru)_ro^ 
z*wgz50x`X*qg2Bt`QeFBqSgmf0$bQ<-5gs(P}JzaqWM4`=+5-W$7w@}fsv16Dt=9! zZrVG7*ClLQ8uru~(W{s|c1-Dthf+LUp@+^(UwOgcr6ar&YgWja}+#=SAg%;D9IAY^YRsR(s{q!{RJ0GqZ$&_zY3*|F))GOF>Gr}^Wg~=)1(st z?K|U?GYYSz8W<8gXE%CRg<|c*S8slnL}FHi?j|mtAZ+hPgjvv$MKwr7%>sCIyREaw zS5R1FGgCvIAG>Ll!Q&DIz4(iUZl!6}I_7@FrUNt2ILk}2){St`>U#_1`vp0yI#?O6JC^yfb+!#6XTuPW+Fo`)?YZ+eb?495>kSv?O4$jX$NN-Hs0l9N87woOgC_uUHngoE^)ui0|*R9tX;z+C_#&6MtK_-e`HwBpW_=lqPo zRnhN4!qT|ack?E_LaOps4$~wP-`dKLx@(rz?q$ZuPBEiE80?4&p#kNR9bo<){9G;L zr!xN2qBRESifRGI;nbFTtD@JmT<(p5tCm@pCN%?Sj7=fNi88`N8&`penJqXJG+S@% zZW{oD<>;)7=XJ33gw!@nB4}vDj0YnGtVjbtG2pq{ve zkaAP*wH8Sla9d$p zTjrn%Upt_k@~WV+XU||qwYlTe1QlgC3Iw9gy=?t%l~{||Th7$!D~WzZk6zSIk*Gy! z$8d3mj)YNhd@XIa*yT0IA)7sXxW)*f$6V^W-xY)&Ov!p-l`j?Ma+x*=mc7TnJuA1t zh!nJNr+_AOTuLU`zW?lr9nMnn+c>Nv^u{F@0?+;!!hE{8h?dQjIF}wb*BQYP{VV@M z2LRouelprN_zVoSgfn~%WXblWz{}xh_v_`~UEDsz;1yE7HTg@%bSte0t!NPe=Y(8U zrSC2-H=^w1fL-L!pHj6D_sIX7&T@uFwb<@-o^(Ls`Gng`>gX@(AG#q|_xF%c1;AUg zi=}RbnH%1JJl`oTDT$!783jj3NNjZR5NKR(}pQ%0tH$AEh4pNFo=0HeR;3}H=nI;4Wg{f>} z>rABZkrX*)$2$*}A1D>c$c%ND@GNAdnoM&kk4zVawyHUN4*!f2Et)$kqjl(>X*3<) zvijM|%95fv*4>&YbT35GmWPK*V9%{%l{I>?&b(i) zHJ#>bYb;N+A4}G|;&{ARQ^{%Mt)N=V6IgQZOEhreSKRs$!N*YSPv$j`=5^ViQOVL2 zWi7U66kd`3>32f&KIMZa4v#9n!tZQHO^L8bCkZSM1P<=qPiFG`8Nqq3kcsK>(v6Q& z?09!u`mR_X98wmJ52w@20@jrQjKHN92B=^d46@*d{> z_Vu;3wX0ymjQ&K^EC>50Z_L2W#DT9NSM}yU5taHwcoheqrP>C*CcZGWYt|cgXX!JZ zIkB=`bKbuE%ESIMx5a?63m2v2W&g@_xI=pv#-Ne)WlQv)t3)E`M3`8DQ%nsa>gp3b zdN+(lnZaIm6q0rNt6hrc=8!nVsQTIOv0UM~A?sy{QcXOM%F@NvI8GLCFgJ>LW!h|V2z!Tcl2;)`P)FldEHX3tpH8wd1eQ~FHJ~r+fIIUxWR?$=g%%AXPt3-Qb;b0JbnL7 z8-GEV5Jg}$lOuWXe4ynX_chGTHYf*6jnt>_Ow@YvK?H-Z6lz&eNgv6!DTpp%z@bLv}p*xiJ*gB>`Gs+FG*&EbDr9{u)-#Zo3 zzB6n*ASOy_t6gn#3kBfuUE&3yKse0ZbnuEfMOKSyn7ug#1uYwP@0Crmzt^3@1PI++&ZeL|WIq#cT?Sg;N1I=23NbXHFb(qa6T7-k8y}_ge zUA4?`-~R1dQ~7UU11w(}80?uU?)LeR=*9bOzlBoo9$judRnPMJT;%bl`PElykC9?U z7wxCz^ChGZ`&(COAx*H)lHMq9ly8%fZze+n2x`m+TZmIO znFOJm&+;Jg)kMaD{n%57_4f8IqZ>mRyX|+f_Q!7`5*2J}9JDGHUP|E^Ek+5OzGQ6B zYU#rB(S@&KJ5AFams|z)lMp(IwXrEn4DbtvP=i#ky*_Fl(F%@dsIk#<<_IF5iVs)m zHP1a%y(P4ur(XGC?UM9IDW;x4XTG1%oD>2*r6;G9rY92xpA;o%x1oOkwi5lb;|W&o zi&=<0p*TRqbz#kxykoTjjSsSYMinOb?jw)dzzV8U!{&*@xR`dT55c7-hzfuzl>xI< zz*jv_SZN2VF$kX#0#}s;Pc%IWdh~-N{}@q`heSMa6eod=C_!RPH=2^$ehW$;7U(p;)~ zFd;go+ZBSMZ@Sm{FQO_RaH1xCx{V%MVoLAT74(F}U+ire&L(ylE%5l*Z)O^Oz>Zu` zWqI=ku{9t*eE&V>?3YL5kK>w*7cN(HFTKAj=(gE3n)ZdxtJ_!Yyhg}h`GklGGxuAn z8J~1l3aJ#g8uv?bF1hn9S$X?SFK)hqzbVXul3liOYU#&FQfWZd96BNeCV1OggV-`p zm!McdeK4f~)yAA=$&$b9G>q1$Zi>|V^-lWO_$e10qK#^Z8*fM$cPU_Gx~tt)>zZp> zBWV>7jj2dqXls?kLG+8vE;DYAnZsh`rissXQdoH?$EK9479s~vZa)36MqBl>L!3$u z3DHPxFcMt~{CM)LBuc~dwb225LWZZ1%YerD(%}vu-0Cp~&u@;VR6Vy-ER-%r& zL5q>zz`(b~a0?B&9LiHc$ZzyP99zIfJ_EbS%zErdXcZxh+$tGwfp5W9^OF1iELVdk zuB$lAsnKysbus5pA;;mK>+H-QQ)azPYJfwuPp37#-@Nb!2Gud#AW-lD9PX5RHCy?0l*a&JWrU8C)!nT98_)+*M)U@f zf9@K6%>$)*n<#tQWe77jh64o7$G3DE6>WSe+qHio{guVWdGF?Vq z7k6~>JFXw(Dtnoy#ph2LnK{2}v86!<0!S`N+yG(pw_xnghyKMiw9lDgtIg1p_hQ?6 z6)BOY{D-@GE4<#ms8JhZcd9W=#zI3q@%^!Mhd;!^aG2N((?BHrB#=p^1iJbkrPcMzz5k{FJDOPmX5qx=|CWaU zup9s9%z_Z*1weR<$3b{SIPYKlWy^hMUwEA7@&kiSYe#oS}S1L*PfJy>^!hH+QpZxo_8K|`U zKkxoupMxhYKi`NO*xF{dv%cd@P$Q*bqyogEEbJkZ205zDmj?$DYsgDsK@>Hnukkfr zfk1JE^RdDwo2q{~Gw|vJVn_Jp1JVw(d+hwp(E}>Njl1yI7NsM+?MWs}_?SDeSAfe4 zDDrg|;PguxVjec`Vh&9@lQw1mrV~_t^58W;xW1|#ShuIkL+MW^K0X|7cV^{0=Tu^p z>pc%4W6TqXq)B5$Y?(=EhM-!=4Gkj(cSz1^pqf#FAFLSF4)8Iz`bww;@x#pk`I12u zT`kBt&%q}pfhK4n{SeG*_6R=Ew^zi&+w;o{3XKpC6|yh7h_OevyGB8fj^e|-T4#M& zaF^O>`u+%X(Aq4O5|Wkj1JmTz9Fl#j{ZYZ&;%Vx&iBkrHoA^QueU#CvPf@uWFuq`k zjSQka%Uyh&>qtjFZvwO&Q53#CR6vzY2(3m?3;#Sst?5yF3L?Qg3yCXoJkhQpiK1HL 
zYeT9nYQ5k9iVdHcEehzQ;pP;~Tm@1t_f4E!k$cd<)f3#ds!r(8S$tC`m9n^fR%!e& z$ZKMHdisX<+|9|c>A|n>FX0$K{ZnrS49mj7+wfup2xq+sDhD97Q0dBUYN0dOO$on= zR|?4@E75EB2;`1Cqtm~>a7cTG4SjyeoVk!7b*|nU1s0tobjCeIwXDh?x~&+5pIi1q z4q_AlXOS0OUyXq@&4+kwFB0p6R06422yp%e=n)9u9CgieprEFel+t!y^WG5=-r&Wfz77qcq6rD?sB#2Eb>`%BC(@+tB%Nb73VqVVAs>VDrv&-zBO4NJgfVD&Pw zPHlrOiuV0DN%Rw0UyPiRl)U7r7C5-=<7ga@bos>%l-c!KR)4nhcumc6f~}MsJ}}4v zo!f2`x-rk(gM}eFdba1-(wQF4b1&Ddyx?YPgZ&W5WC&wa(n(>K>oQUxUjOeiQwxzf z1PE97DuDV%kVNJo;r63--2!P?B|0-iazPE*ir9?5Prcj(Y4uQ3*Z^VpuaDCE2JZ(( z_MRlof0?&*w zHNt5rSPj;9mvhm&eLu55Q?JbMv9NSbKlS*alCt9yOPS$6zYXBFxsvl<57BR0k%*37 z+T3RRFoQCWG$26E#oAcVU<5UwlDFg1Zs;9#AeU_P>U_V=XjAb|uKhN)*m>^SJRj@1 zmu2KGB(;-&=SLBhXf2giw8;~~NtkY9gGh}oaq20B*pkl^w4B7kxIu}r0Qy^sQ}Pr# z0k>tv$+gX0{!9&-uVr!eTXV2I#RYvg2!@V2sk++#LM-I=@QNH+CPir>R5tM+B2!M2 zf#)qC8e%P{;m3EB@am}?ka#|8K8`AE6o5`6^(oESP5`-(N;`SEze7~?NUwrT=A+MF zg?3L6YHNZreX$W!s64N4t&kTC{Bw<rX zqrXJlFm}ym=lF6Mk^;`sX1CHVO7^{hak^`tMF*ACd@@d zibqW=wz^GnJ`e;txuSbp%e3`Q$bLOi-2(=Id*4U}LJw zifL-8szR`VmL?lLPzutEk=*L8^u300Lypk59=F<_4rg@33YXg$uy!i=Tfsi#raE;A z?7kKP*Zs@G!SrzL*}qs=s!yL_PqfkUKmJ-@M61m6)!h@-Y}v0`#&TM8wX!RxSpywH zYN{XSy>N)W{ldwL#I`$w`b=HMHKOFnL+yTqgyvlCPuXA>e0nn;jYYMs^c6#F+g`}D z24Kw<~Kuk_D#%3*$$f+3%G+QHSK1=8Lmj% zTglxsWx&Kj^7iNE=9sr7jB?c>%B`V$zW(hqGc%#?&c*Jnh5HyOFLhWs*Tb9!-_&dR zHgHd?>po?(E;pC~^U?Sq@~27_rU57&X`G9_JpGG*H;I#9;c}|v5&`6}H(@&x0HOll zv%+>c9~l1?6uQ~ZrE2XuJ)G&@w^g?oZXvs8;gYqXXx-L1T$^Pv#?HXnA4CU8)a63y zXtp0Z&TSF{o<7FM8?y`Y>LWgfm_3AJYQI~V;l@6&Yw}ZmxW)*d3AcV57w4P{5Cw@F z;(;XDr?}b1;wCqN(U&h@!gD9n@kzz|0w9Tm{QUflv10v3FC(B0)3NuIZs&Y!kM!-X&@gQN#r|er zeSd%d#`n(1!K?vNN&&_kWMGjgX8DocJKql8u9(OT-Rj=}!q0wK^cogxGodb8->&Oi zon7ECv9se(TCaea&$`S6mbwept5ycqh#r4x;omcXQ7=rV`!t^dIRw$k_p=;$d z%6cC)@qIky)+9ge^)8UAWALI3PI`Pu#x_Nu&_?Z+{+HtNG>t;57!pi)p;;2mMp-Q68kQp^RjFj7av%#`z4B_m~7UFsa54oagz?@hcQ_de3+L( z!R+m?!)$PJ8Lxqr)0wGtysK_j_<{Xgnbrn@fb8w-<7Ve&+TiMTlE4;hChR3wpY~Q(aXKM{`bPBLps=>aUZGTpnJ76^6xOmIJl{*4Zmc=ST zu`cfu`f7;l=6qr|pSsnyAWa0y8dg191TLEu=Vcif7~D=pd)X>Kf3Up%=teANES0^a zdsfdQ_4JE3jf*YqvZOY_!Pqip8mxm-{-_|^5REe&dm=*MIsX^S5tXB+eGi_36MRyi z?A@a+SlVF#YDrx!Z2yIOirnS5Q)45-MrMICyTWe91N!ZYglFR-qPn_H*vd zxkKCz#QGCYZ_Mi5^nDkXn8Z>z*1&&Srg1J>W9qAM*bbkUio(zLan}pqKi|J+XpOp) zc`U}Ad}(O+z1zOBmD7t=?|m-NfSaPSle=mCgm0!$JZNRm_oi<$EGCpT>b)aYp<-&Z z*K&_g75`V2kM>+lKs>C%xNs?IL7b$r>z_)iF7q<8il6MHGrHajCO1hK^=;+(q?+Jf zJhU_IP<-6_$l3qiS`h|+SrRUZ6FJ}f5Pc8g!w~o)%g-;6D2vVvrEf9r4#+9UI_)w_ z_uV1u*C4;jaq*3h%?8qpY^E2;*jG#K{09ro@B+n_^!6*(g}|cwgVsauK2+w6DBE^L zPnr(WI=b?6Ox)@bjsnKDckHC)yXn+v(icM~#W27;PY4{Yuk&15K%Js%qEc*P)OdMZ zQT?RBD!XlB(z-lYh>M0FKRKn85zRJIMo6Xzgxb2giBqa$_l9d*c0pB3B7^n=;wg&q zNDj0f!frj>cA#Oskf4*{>fb3-+YGjR)tG?fC_Iwua11MX~8)(r%_ z0G+@NrLH-}5bs<1SXw_pTVVL6%zn&l2QE~s292qDA)yvHX|okyO^iyCP#4@RSck21 z0IHy&DhjW-Qz_7tmu@eTqLH-nB@#TW-&pLtZJiw0I^M9Vcx3QBOip52K(MHE!1LOX zt4TBSJSlhEQH=DZ{%J<8?enZ_ePU*_0{S-7y=&XG!kI@0-bBTItpXq=EWUrz6J#TDfQ3D86ehJ8c27&UhtgNhC;%zrB+TOIzx>HL9$hvKE z43|KNLpsxwb3Y6RyWVfyXvsc|L3bJ#A$#*?n*G@l zhq1Z2N9sPVuA(Ynk|!J>S*Db&>C=u0&VUYOm@y9=IqKQj*bM97Ix1zwdRH9vJvcpAI*cA)ed(f`TLJl-0s9BzHxZ zo*oA%?;H?_{kdFKbAUQK2UmFS-%s{`^Da+>1{^z9_V)H9K)WR!Vzaoo*yzOpdSeMG zbhAYl;F**yT+&UgDm`pI>y@@uqn=phcEo$-D;{*6Ik+?z4+@?^?x0TvjRqY`VRGKmO43bo-ZKr>m(E2I=eM%9k|^KItCO1E zy@hZ&W;6Zu5=2Oaphn&gJhSGoY2zx;M2;S+eDmfD=*mbo z8C7MI%BWfcj#F%V%9sR36Tjx7dC9B)T03`=YH{<`OH z1B7nKU4+S(jpG2#~0{#&P*Kc){fB%8E{rZ>t~lFjEGiL=9*ibB&R>manbC%8=6 z!P8U1dSR^**iE4wV?QRXg2e837*1`~NR078PHa<8?l1W;tKe;av{%a@H9P zZGI5OU6z}@Zi4Q(bq(49!;=@OM7wg^Yu4TYIuZ>h#9P%dwTo;wgbuu@`sgd3k0!LX zv`ZhgTm)aleb+yLc>z;_Co$Pdab-~3C!~jFxfC@)A&R%KRz@<^3bRi*AK0vg9P@oK84j%lLW 
zfO?!9%U*MzWOBJUIj^8v!DMf2JAM(hu+8ZarU4!5fKa#A*2YV1wmYc!)=UljKm!T4`Z!Fu>k!uL1AL!ej&Jwe;DLi9U~Yb+PsnOk|Wj_=YgtT8n3cxx~j3L^4UEU z2oG{-B(%@of|2UxZU1aZ8+g(zw3H6$GegT+ZxVkS>K77{-p}kL8)%Kih$^d_zuao}vFz2|0)u z#H~vOS_0$TzsEc<^(1EY*2dwOXTiKYUjflh3rzKlqpvtg7j=wMw zr%%n}t0GxdcHt}Kud40Zt(7Fr)}=Uj9({V2zZDjq{8sj;>59+G9UAT(alC558=I{X zB-K0uoLNgdN$uL*5ka$k#@V~QKqAVq&u>ar*r&S`Ot`DF($c<`(hubUZE*%lMT~OZ zR{O0_l&ywhH+n}l%!9vIs&!k$`IIlJDl2CY0pksv8pTZ<4goc6W>_WlGW4x@LA4Ac zz-D1GT$sa))<2qNn_FO<16DQ{`#!L7wM6N_XyNE!^!*SCN3r85e&Kt z7~j02=fh{|our#5H-+FD4By@Nqy=o!zSp#!PCf)ZIm0$Eo7{AUA4dcNw5T;H6-&Nz zztlk!20)vT6=be`%?1Dr?#bTQrnFV@&my2ukC{-H-}KPCB;dDE-pAeDy;Iqu`jH#t zj!9Kv0FgX8`apfMEf72HB0stCBdDIh>h4aID7ghE+sf=i> zp~jPT=LA(=G$qrbe!DasM`<{s-Of&bSx8p1n z7B*w;1}f`2ls^<9|C;)iZmd?RPP`scwLNa6(Kuq~CXfxuCfsvMm9i~QddkVD3`p|M zhL~GRU}ou|g~L~-H6zQo9@zQ;N~wLND08;ea;`~jMen)4xRhWaTEs{g!5`ZI-|peF zzfCrC!4!Lmd0AVmO~EBFPZ)7q1)Fr3*-a`B zR9N3N!F=H_^s=&Rl2uk!%_5WSgO`%^fDfqLSU>Z5tSYRTvQq0`xJThRYJ=1.14.2 torch>=0.4.0 -torchvision>=0.1.8 tensorboardX From da901ed5b092bda93c73fe3a85d753ba5da04b96 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Fri, 30 Nov 2018 23:56:44 +0800 Subject: [PATCH 02/67] * DataSet __getitem__ returns copy of Instance * refine interface of set_target & set_input * rename DataSet.Instance into DataSet.DataSetIter * remove unused methods in DataSet.DataSetIter * remove __setattr__ in DataSet; It is dangerous. * comment adjustment --- fastNLP/core/dataset.py | 174 +++++++++++++++----------------------- test/core/test_dataset.py | 5 +- 2 files changed, 70 insertions(+), 109 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 8583b95b..920e9f11 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -1,5 +1,4 @@ import numpy as np -from copy import copy from fastNLP.core.fieldarray import FieldArray from fastNLP.core.instance import Instance @@ -28,38 +27,22 @@ class DataSet(object): """ - class Instance(object): - def __init__(self, dataset, idx=-1, **fields): - self.dataset = dataset + class DataSetIter(object): + def __init__(self, data_set, idx=-1, **fields): + self.data_set = data_set self.idx = idx self.fields = fields def __next__(self): self.idx += 1 - if self.idx >= len(self.dataset): + if self.idx >= len(self.data_set): raise StopIteration - return copy(self) - - def add_field(self, field_name, field): - """Add a new field to the instance. - - :param field_name: str, the name of the field. - :param field: - """ - self.fields[field_name] = field - - def __getitem__(self, name): - return self.dataset[name][self.idx] - - def __setitem__(self, name, val): - if name not in self.dataset: - new_fields = [None] * len(self.dataset) - self.dataset.add_field(name, new_fields) - self.dataset[name][self.idx] = val + # this returns a copy + return self.data_set[self.idx] def __repr__(self): - return "\n".join(['{}: {}'.format(name, repr(self.dataset[name][self.idx])) for name - in self.dataset.get_fields().keys()]) + return "\n".join(['{}: {}'.format(name, repr(self.data_set[name][self.idx])) for name + in self.data_set.get_fields().keys()]) def __init__(self, data=None): """ @@ -89,14 +72,41 @@ def __contains__(self, item): return item in self.field_arrays def __iter__(self): - return self.Instance(self) + return self.DataSetIter(self) - def _convert_ins(self, ins_list): - if isinstance(ins_list, list): - for ins in ins_list: - self.append(ins) + def __getitem__(self, idx): + """Fetch Instance(s) at the `idx` position(s) in the dataset. 
+ Notice: This method returns a copy of the actual instance(s). Any change to the returned value would not modify + the origin instance(s) of the DataSet. + If you want to make in-place changes to all Instances, use `apply` method. + + :param idx: can be int or slice. + :return: If `idx` is int, return an Instance object. + If `idx` is slice, return a DataSet object. + """ + if isinstance(idx, int): + return Instance(**{name: self.field_arrays[name][idx] for name in self.field_arrays}) + elif isinstance(idx, slice): + data_set = DataSet() + for field in self.field_arrays.values(): + data_set.add_field(name=field.name, + fields=field.content[idx], + padding_val=field.padding_val, + is_input=field.is_input, + is_target=field.is_target) + return data_set else: - self.append(ins_list) + raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) + + def __len__(self): + """Fetch the length of the dataset. + + :return int length: + """ + if len(self.field_arrays) == 0: + return 0 + field = iter(self.field_arrays.values()).__next__() + return len(field) def append(self, ins): """Add an instance to the DataSet. @@ -143,72 +153,47 @@ def get_fields(self): """ return self.field_arrays - def __getitem__(self, idx): - """ - - :param idx: can be int, slice, or str. - :return: If `idx` is int, return an Instance object. - If `idx` is slice, return a DataSet object. - If `idx` is str, it must be a field name, return the field. - - """ - if isinstance(idx, int): - return self.Instance(self, idx, **{name: self.field_arrays[name][idx] for name in self.field_arrays}) - elif isinstance(idx, slice): - data_set = DataSet() - for field in self.field_arrays.values(): - data_set.add_field(name=field.name, - fields=field.content[idx], - padding_val=field.padding_val, - is_input=field.is_input, - is_target=field.is_target) - return data_set - elif isinstance(idx, str): - return self.field_arrays[idx] - else: - raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) - - def __len__(self): - if len(self.field_arrays) == 0: - return 0 - field = iter(self.field_arrays.values()).__next__() - return len(field) - def get_length(self): - """The same as __len__ + """Fetch the length of the dataset. + :return int length: """ return len(self) def rename_field(self, old_name, new_name): - """rename a field + """Rename a field. + + :param str old_name: + :param str new_name: """ if old_name in self.field_arrays: self.field_arrays[new_name] = self.field_arrays.pop(old_name) else: raise KeyError("{} is not a valid name. ".format(old_name)) - def set_target(self, **fields): - """Change the flag of `is_target` for all instance. For fields not set here, leave their `is_target` unchanged. + def set_target(self, *field_names, flag=True): + """Change the target flag of these fields. - :param key-value pairs for field-name and `is_target` value(True, False). + :param field_names: a sequence of str, indicating field names + :param bool flag: Set these fields as target if True. Unset them if False. """ - for name, val in fields.items(): + for name in field_names: if name in self.field_arrays: - assert isinstance(val, bool) - self.field_arrays[name].is_target = val + self.field_arrays[name].is_target = flag else: raise KeyError("{} is not a valid field name.".format(name)) - return self - def set_input(self, **fields): - for name, val in fields.items(): + def set_input(self, *field_name, flag=True): + """Set the input flag of these fields. 
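+        For example, `data_set.set_input("word_seq", "seq_len")` marks both fields as input,
+        and `data_set.set_input("word_seq", flag=False)` unsets the flag again (the field
+        names here are only illustrative; they must already exist in the DataSet).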
+ + :param field_name: a sequence of str, indicating field names. + :param bool flag: Set these fields as input if True. Unset them if False. + """ + for name in field_name: if name in self.field_arrays: - assert isinstance(val, bool) - self.field_arrays[name].is_input = val + self.field_arrays[name].is_input = flag else: raise KeyError("{} is not a valid field name.".format(name)) - return self def get_input_name(self): return [name for name, field in self.field_arrays.items() if field.is_input] @@ -216,27 +201,6 @@ def get_input_name(self): def get_target_name(self): return [name for name, field in self.field_arrays.items() if field.is_target] - def __getattr__(self, item): - # block infinite recursion for copy, pickle - if item == '__setstate__': - raise AttributeError(item) - try: - return self.field_arrays.__getitem__(item) - except KeyError: - pass - try: - reader_cls = _READERS[item] - - # add read_*data() support - def _read(*args, **kwargs): - data = reader_cls().load(*args, **kwargs) - self.extend(data) - return self - - return _read - except KeyError: - raise AttributeError('{} does not exist.'.format(item)) - @classmethod def set_reader(cls, method_name): """decorator to add dataloader support @@ -275,7 +239,6 @@ def drop(self, func): results = [ins for ins in self if not func(ins)] for name, old_field in self.field_arrays.items(): self.field_arrays[name].content = [ins[name] for ins in results] - # print(self.field_arrays[name]) def split(self, dev_ratio): """Split the dataset into training and development(validation) set. @@ -300,27 +263,28 @@ def split(self, dev_ratio): return train_set, dev_set @classmethod - def read_csv(cls, csv_path, headers=None, sep='\t', dropna=True): - with open(csv_path, 'r') as f: + def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): + with open(csv_path, "r") as f: start_idx = 0 if headers is None: headers = f.readline().rstrip('\r\n') headers = headers.split(sep) start_idx += 1 else: - assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format(type(headers)) + assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format( + type(headers)) _dict = {} for col in headers: _dict[col] = [] for line_idx, line in enumerate(f, start_idx): contents = line.split(sep) - if len(contents)!=len(headers): + if len(contents) != len(headers): if dropna: continue else: - #TODO change error type - raise ValueError("Line {} has {} parts, while header has {} parts."\ - .format(line_idx, len(contents), len(headers))) + # TODO change error type + raise ValueError("Line {} has {} parts, while header has {} parts." 
\ + .format(line_idx, len(contents), len(headers))) for header, content in zip(headers, contents): _dict[header].append(content) return cls(_dict) diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index b985b253..786e7248 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -55,7 +55,7 @@ def test_delete_field(self): def test_getitem(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) ins_1, ins_0 = ds[0], ds[1] - self.assertTrue(isinstance(ins_1, DataSet.Instance) and isinstance(ins_0, DataSet.Instance)) + self.assertTrue(isinstance(ins_1, Instance) and isinstance(ins_0, Instance)) self.assertEqual(ins_1["x"], [1, 2, 3, 4]) self.assertEqual(ins_1["y"], [5, 6]) self.assertEqual(ins_0["x"], [1, 2, 3, 4]) @@ -65,9 +65,6 @@ def test_getitem(self): self.assertTrue(isinstance(sub_ds, DataSet)) self.assertEqual(len(sub_ds), 10) - field = ds["x"] - self.assertEqual(field, ds.field_arrays["x"]) - def test_apply(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx") From 6839bb91cceaf4bf868f2d89a507febdbf08962e Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 10:38:01 +0800 Subject: [PATCH 03/67] Add auto type detection/conversion in FieldArray * In init, detect content type to be Python int, float, or str. * In append(), check type consistence. * In init & append(), int will be cast into float if they occur together. * Map Python type into numpy dtype * Raise error if type detection fails. --- fastNLP/core/fieldarray.py | 67 ++++++++++++++++++++++++++++-------- test/core/test_fieldarray.py | 20 +++++++++++ 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 3a63f788..f93fbf2e 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -6,6 +6,7 @@ class FieldArray(object): It is the basic element of DataSet class. """ + def __init__(self, name, content, padding_val=0, is_target=False, is_input=False): """ @@ -20,21 +21,56 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False self.padding_val = padding_val self.is_target = is_target self.is_input = is_input - # TODO: auto detect dtype - self.dtype = None + self.pytype = self._type_detection(content) + self.dtype = self._map_to_np_type(self.pytype) + + @staticmethod + def _type_detection(content): + type_set = set([type(item) for item in content]) + if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)): + return type_set.pop() + elif len(type_set) == 2 and float in type_set and int in type_set: + # up-cast int to float + for idx, _ in enumerate(content): + content[idx] = float(content[idx]) + return float + else: + raise ValueError("Unsupported type conversion detected in FieldArray: {}".format(*type_set)) + + @staticmethod + def _map_to_np_type(basic_type): + type_mapping = {int: np.int64, float: np.double, str: np.str} + return type_mapping[basic_type] def __repr__(self): return "FieldArray {}: {}".format(self.name, self.content.__repr__()) def append(self, val): + """Add a new item to the tail of FieldArray. + + :param val: int, float, or str. 
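+
+        For example, appending 1.5 to an int-typed FieldArray up-casts the existing content
+        to float, while a type mismatch that cannot be reconciled (e.g. appending a str to a
+        numeric FieldArray) raises a ValueError.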
+ """ + val_type = type(val) + if val_type is int and self.pytype is float: + # up-cast the appended value + val = float(val) + elif val_type is float and self.pytype is int: + # up-cast all other values in the content + for idx, _ in enumerate(self.content): + self.content[idx] = float(self.content[idx]) + self.pytype = float + self.dtype = self._map_to_np_type(self.pytype) + + elif val_type != self.pytype: + raise ValueError("Cannot append a {}-type value into a {}-tpye FieldArray.".format(val_type, self.pytype)) self.content.append(val) - def __getitem__(self, name): - return self.get(name) + def __getitem__(self, indices): + return self.get(indices) - def __setitem__(self, name, val): - assert isinstance(name, int) - self.content[name] = val + def __setitem__(self, idx, val): + assert isinstance(idx, int) + self.content[idx] = val def get(self, indices): """Fetch instances based on indices. @@ -42,31 +78,32 @@ def get(self, indices): :param indices: an int, or a list of int. :return: """ + # TODO: 返回行为不一致,有隐患 if isinstance(indices, int): return self.content[indices] assert self.is_input is True or self.is_target is True batch_size = len(indices) # TODO 当这个fieldArray是seq_length这种只有一位的内容时,不需要padding,需要再讨论一下 - if not isiterable(self.content[0]): - if self.dtype is None: - self.dtype = np.int64 if isinstance(self.content[0], int) else np.double + if not is_iterable(self.content[0]): array = np.array([self.content[i] for i in indices], dtype=self.dtype) else: - if self.dtype is None: - self.dtype = np.int64 max_len = max([len(self.content[i]) for i in indices]) array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype) - for i, idx in enumerate(indices): array[i][:len(self.content[idx])] = self.content[idx] return array def __len__(self): + """Returns the size of FieldArray. 
+ + :return int length: + """ return len(self.content) -def isiterable(content): + +def is_iterable(content): try: _ = (e for e in content) except TypeError: return False - return True \ No newline at end of file + return True diff --git a/test/core/test_fieldarray.py b/test/core/test_fieldarray.py index 07f02c54..883e1136 100644 --- a/test/core/test_fieldarray.py +++ b/test/core/test_fieldarray.py @@ -20,3 +20,23 @@ def test(self): self.assertEqual(fa.get(0), 1) self.assertTrue(isinstance(fa.get([0, 1, 2]), np.ndarray)) self.assertListEqual(list(fa.get([0, 1, 2])), [1, 2, 3]) + + def test_type_conversion(self): + fa = FieldArray("x", [1.2, 2.2, 3, 4, 5], is_input=True) + self.assertEqual(fa.pytype, float) + self.assertEqual(fa.dtype, np.double) + + fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True) + fa.append(1.3333) + self.assertEqual(fa.pytype, float) + self.assertEqual(fa.dtype, np.double) + + fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=False) + fa.append(10) + self.assertEqual(fa.pytype, float) + self.assertEqual(fa.dtype, np.double) + + fa = FieldArray("y", ["a", "b", "c", "d"], is_input=False) + fa.append("e") + self.assertEqual(fa.dtype, np.str) + self.assertEqual(fa.pytype, str) From 07e227aa4dd05004856c44211dd67f5ca961295a Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 14:57:49 +0800 Subject: [PATCH 04/67] add interface of Loss --- fastNLP/core/__init__.py | 4 +- fastNLP/core/loss.py | 196 ----------------------------------- fastNLP/core/losses.py | 219 +++++++++++++++++++++++++++++++++++++++ fastNLP/core/trainer.py | 16 ++- test/core/test_loss.py | 7 +- 5 files changed, 232 insertions(+), 210 deletions(-) delete mode 100644 fastNLP/core/loss.py create mode 100644 fastNLP/core/losses.py diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 1003c824..dfe35f77 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -2,10 +2,10 @@ from .dataset import DataSet from .fieldarray import FieldArray from .instance import Instance +from .losses import Loss from .metrics import Evaluator, ClassifyEvaluator, SNLIEvaluator, SeqLabelEvaluator +from .optimizer import Optimizer from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler from .tester import Tester from .trainer import Trainer from .vocabulary import Vocabulary -from .optimizer import Optimizer -from .loss import Loss diff --git a/fastNLP/core/loss.py b/fastNLP/core/loss.py deleted file mode 100644 index 093b3b96..00000000 --- a/fastNLP/core/loss.py +++ /dev/null @@ -1,196 +0,0 @@ -import torch - -def squash(predict , truth , **kwargs): - '''To reshape tensors in order to fit Loss functions in pytorch - - :param predict : Tensor, model output - :param truth : Tensor, truth from dataset - :param **kwargs : extra arguments - - :return predict , truth: predict & truth after processing - ''' - return predict.view(-1 , predict.size()[-1]) , truth.view(-1,) - -def unpad(predict , truth , **kwargs): - '''To process padded sequence output to get true loss - Using pack_padded_sequence() method - This method contains squash() - - :param predict : Tensor, [batch_size , max_len , tag_size] - :param truth : Tensor, [batch_size , max_len] - :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist - kwargs["lens"] : list or LongTensor, [batch_size] - the i-th element is true lengths of i-th sequence - - :return predict , truth: predict & truth after processing - ''' - if kwargs.get("lens") is None: - return predict , truth - lens = 
torch.LongTensor(kwargs["lens"]) - lens , idx = torch.sort(lens , descending = True) - predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx] , lens , batch_first = True).data - truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx] , lens , batch_first = True).data - return predict , truth - -def unpad_mask(predict , truth , **kwargs): - '''To process padded sequence output to get true loss - Using mask() method - This method contains squash() - - :param predict : Tensor, [batch_size , max_len , tag_size] - :param truth : Tensor, [batch_size , max_len] - :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist - kwargs["lens"] : list or LongTensor, [batch_size] - the i-th element is true lengths of i-th sequence - - :return predict , truth: predict & truth after processing - ''' - if kwargs.get("lens") is None: - return predict , truth - mas = make_mask(kwargs["lens"] , truth.size()[1]) - return mask(predict , truth , mask = mas) - -def mask(predict , truth , **kwargs): - '''To select specific elements from Tensor - This method contains squash() - - :param predict : Tensor, [batch_size , max_len , tag_size] - :param truth : Tensor, [batch_size , max_len] - :param **kwargs : extra arguments, kwargs["mask"] is expected to be exsist - kwargs["mask"] : ByteTensor, [batch_size , max_len] - the mask Tensor , the position that is 1 will be selected - - :return predict , truth: predict & truth after processing - ''' - if kwargs.get("mask") is None: - return predict , truth - mask = kwargs["mask"] - - predict , truth = squash(predict , truth) - mask = mask.view(-1,) - - predict = torch.masked_select(predict.permute(1,0) , mask).view(predict.size()[-1] , -1).permute(1,0) - truth = torch.masked_select(truth , mask) - - return predict , truth - -def make_mask(lens , tar_len): - '''to generate a mask that select [:lens[i]] for i-th element - embezzle from fastNLP.models.sequence_modeling.seq_mask - - :param lens : list or LongTensor, [batch_size] - :param tar_len : int - - :return mask : ByteTensor - ''' - lens = torch.LongTensor(lens) - mask = [torch.ge(lens, i + 1) for i in range(tar_len)] - mask = torch.stack(mask, 1) - return mask - -#map string to function. 
Just for more elegant using -method_dict = { - "squash" : squash, - "unpad" : unpad, - "unpad_mask" : unpad_mask, - "mask" : mask, -} - -loss_function_name = { - "L1Loss".lower() : torch.nn.L1Loss, - "BCELoss".lower() : torch.nn.BCELoss, - "MSELoss".lower() : torch.nn.MSELoss, - "NLLLoss".lower() : torch.nn.NLLLoss, - "KLDivLoss".lower() : torch.nn.KLDivLoss, - "NLLLoss2dLoss".lower() : torch.nn.NLLLoss2d, #every name should end with "loss" - "SmoothL1Loss".lower() : torch.nn.SmoothL1Loss, - "SoftMarginLoss".lower() : torch.nn.SoftMarginLoss, - "PoissonNLLLoss".lower() : torch.nn.PoissonNLLLoss, - "MultiMarginLoss".lower() : torch.nn.MultiMarginLoss, - "CrossEntropyLoss".lower() : torch.nn.CrossEntropyLoss, - "BCEWithLogitsLoss".lower() : torch.nn.BCEWithLogitsLoss, - "MarginRankingLoss".lower() : torch.nn.MarginRankingLoss, - "TripletMarginLoss".lower() : torch.nn.TripletMarginLoss, - "HingeEmbeddingLoss".lower() : torch.nn.HingeEmbeddingLoss, - "CosineEmbeddingLoss".lower() : torch.nn.CosineEmbeddingLoss, - "MultiLabelMarginLoss".lower() : torch.nn.MultiLabelMarginLoss, - "MultiLabelSoftMarginLoss".lower() : torch.nn.MultiLabelSoftMarginLoss, -} - -class Loss(object): - '''a Loss object is a callable object represents loss functions - ''' - - def __init__(self , loss_name , pre_pro = [squash], **kwargs): - ''' - - :param loss_name: str or None , the name of loss function - :param pre_pro : list of function or str, methods to reform parameters before calculating loss - the strings will be auto translated to pre-defined functions - :param **kwargs: kwargs for torch loss function - - pre_pro funcsions should have three arguments: predict, truth, **arg - predict and truth is the necessary parameters in loss function - kwargs is the extra parameters passed-in when calling loss function - pre_pro functions should return two objects, respectively predict and truth that after processed - - ''' - - if loss_name is None: - # this is useful when Trainer.__init__ performs type check - self._loss = None - else: - if not isinstance(loss_name, str): - raise NotImplementedError - else: - self._loss = self._get_loss(loss_name , **kwargs) - - self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro] - - def add_pre_pro(self , func): - '''add a pre_pro function - - :param func: a function or str, methods to reform parameters before calculating loss - the strings will be auto translated to pre-defined functions - ''' - if not callable(func): - func = method_dict.get(func) - if func is None: - return - self.pre_pro.append(func) - - @staticmethod - def _get_loss(loss_name , **kwargs): - '''Get loss function from torch - - :param loss_name: str, the name of loss function - :param **kwargs: kwargs for torch loss function - :return: A callable loss function object - ''' - loss_name = loss_name.strip().lower() - loss_name = "".join(loss_name.split("_")) - - if len(loss_name) < 4 or loss_name[-4 : ] != "loss": - loss_name += "loss" - return loss_function_name[loss_name](**kwargs) - - def get(self): - '''This method exists just for make some existing codes run error-freely - ''' - return self - - def __call__(self , predict , truth , **kwargs): - '''call a loss function - predict and truth will be processed by pre_pro methods in order of addition - - :param predict : Tensor, model output - :param truth : Tensor, truth from dataset - :param **kwargs : extra arguments, pass to pre_pro functions - for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens - ''' - for f in 
self.pre_pro: - if f is None: - continue - predict , truth = f(predict , truth , **kwargs) - - return self._loss(predict , truth) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py new file mode 100644 index 00000000..1e5a4914 --- /dev/null +++ b/fastNLP/core/losses.py @@ -0,0 +1,219 @@ +import torch + + +class LossBase(object): + def __init__(self): + self.param_map = {} + + def get_loss(self, *args, **kwargs): + raise NotImplementedError + + def __call__(self, output_dict, predict_dict): + pass + + +class Loss(LossBase): + def __init__(self): + pass + + +def squash(predict, truth, **kwargs): + '''To reshape tensors in order to fit Loss functions in pytorch + + :param predict : Tensor, model output + :param truth : Tensor, truth from dataset + :param **kwargs : extra arguments + + :return predict , truth: predict & truth after processing + ''' + return predict.view(-1, predict.size()[-1]), truth.view(-1, ) + + +def unpad(predict, truth, **kwargs): + '''To process padded sequence output to get true loss + Using pack_padded_sequence() method + This method contains squash() + + :param predict : Tensor, [batch_size , max_len , tag_size] + :param truth : Tensor, [batch_size , max_len] + :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist + kwargs["lens"] : list or LongTensor, [batch_size] + the i-th element is true lengths of i-th sequence + + :return predict , truth: predict & truth after processing + ''' + if kwargs.get("lens") is None: + return predict, truth + lens = torch.LongTensor(kwargs["lens"]) + lens, idx = torch.sort(lens, descending=True) + predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens, batch_first=True).data + truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens, batch_first=True).data + return predict, truth + + +def unpad_mask(predict, truth, **kwargs): + '''To process padded sequence output to get true loss + Using mask() method + This method contains squash() + + :param predict : Tensor, [batch_size , max_len , tag_size] + :param truth : Tensor, [batch_size , max_len] + :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist + kwargs["lens"] : list or LongTensor, [batch_size] + the i-th element is true lengths of i-th sequence + + :return predict , truth: predict & truth after processing + ''' + if kwargs.get("lens") is None: + return predict, truth + mas = make_mask(kwargs["lens"], truth.size()[1]) + return mask(predict, truth, mask=mas) + + +def mask(predict, truth, **kwargs): + '''To select specific elements from Tensor + This method contains squash() + + :param predict : Tensor, [batch_size , max_len , tag_size] + :param truth : Tensor, [batch_size , max_len] + :param **kwargs : extra arguments, kwargs["mask"] is expected to be exsist + kwargs["mask"] : ByteTensor, [batch_size , max_len] + the mask Tensor , the position that is 1 will be selected + + :return predict , truth: predict & truth after processing + ''' + if kwargs.get("mask") is None: + return predict, truth + mask = kwargs["mask"] + + predict, truth = squash(predict, truth) + mask = mask.view(-1, ) + + predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0) + truth = torch.masked_select(truth, mask) + + return predict, truth + + +def make_mask(lens, tar_len): + '''to generate a mask that select [:lens[i]] for i-th element + embezzle from fastNLP.models.sequence_modeling.seq_mask + + :param lens : list or LongTensor, [batch_size] + :param tar_len : int + + :return mask : ByteTensor 
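+
+    For example, make_mask([2, 3], 4) gives a 2 x 4 ByteTensor with rows [1, 1, 0, 0] and
+    [1, 1, 1, 0]: position i of row j is 1 exactly when i < lens[j].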
+ ''' + lens = torch.LongTensor(lens) + mask = [torch.ge(lens, i + 1) for i in range(tar_len)] + mask = torch.stack(mask, 1) + return mask + + +# map string to function. Just for more elegant using +method_dict = { + "squash": squash, + "unpad": unpad, + "unpad_mask": unpad_mask, + "mask": mask, +} + +loss_function_name = { + "L1Loss".lower(): torch.nn.L1Loss, + "BCELoss".lower(): torch.nn.BCELoss, + "MSELoss".lower(): torch.nn.MSELoss, + "NLLLoss".lower(): torch.nn.NLLLoss, + "KLDivLoss".lower(): torch.nn.KLDivLoss, + "NLLLoss2dLoss".lower(): torch.nn.NLLLoss2d, # every name should end with "loss" + "SmoothL1Loss".lower(): torch.nn.SmoothL1Loss, + "SoftMarginLoss".lower(): torch.nn.SoftMarginLoss, + "PoissonNLLLoss".lower(): torch.nn.PoissonNLLLoss, + "MultiMarginLoss".lower(): torch.nn.MultiMarginLoss, + "CrossEntropyLoss".lower(): torch.nn.CrossEntropyLoss, + "BCEWithLogitsLoss".lower(): torch.nn.BCEWithLogitsLoss, + "MarginRankingLoss".lower(): torch.nn.MarginRankingLoss, + "TripletMarginLoss".lower(): torch.nn.TripletMarginLoss, + "HingeEmbeddingLoss".lower(): torch.nn.HingeEmbeddingLoss, + "CosineEmbeddingLoss".lower(): torch.nn.CosineEmbeddingLoss, + "MultiLabelMarginLoss".lower(): torch.nn.MultiLabelMarginLoss, + "MultiLabelSoftMarginLoss".lower(): torch.nn.MultiLabelSoftMarginLoss, +} + + +class Loss(object): + '''a Loss object is a callable object represents loss functions + ''' + + def __init__(self, loss_name, pre_pro=[squash], **kwargs): + ''' + + :param loss_name: str or None , the name of loss function + :param pre_pro : list of function or str, methods to reform parameters before calculating loss + the strings will be auto translated to pre-defined functions + :param **kwargs: kwargs for torch loss function + + pre_pro funcsions should have three arguments: predict, truth, **arg + predict and truth is the necessary parameters in loss function + kwargs is the extra parameters passed-in when calling loss function + pre_pro functions should return two objects, respectively predict and truth that after processed + + ''' + + if loss_name is None: + # this is useful when Trainer.__init__ performs type check + self._loss = None + else: + if not isinstance(loss_name, str): + raise NotImplementedError + else: + self._loss = self._get_loss(loss_name, **kwargs) + + self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro] + + def add_pre_pro(self, func): + '''add a pre_pro function + + :param func: a function or str, methods to reform parameters before calculating loss + the strings will be auto translated to pre-defined functions + ''' + if not callable(func): + func = method_dict.get(func) + if func is None: + return + self.pre_pro.append(func) + + @staticmethod + def _get_loss(loss_name, **kwargs): + '''Get loss function from torch + + :param loss_name: str, the name of loss function + :param **kwargs: kwargs for torch loss function + :return: A callable loss function object + ''' + loss_name = loss_name.strip().lower() + loss_name = "".join(loss_name.split("_")) + + if len(loss_name) < 4 or loss_name[-4:] != "loss": + loss_name += "loss" + return loss_function_name[loss_name](**kwargs) + + def get(self): + '''This method exists just for make some existing codes run error-freely + ''' + return self + + def __call__(self, predict, truth, **kwargs): + '''call a loss function + predict and truth will be processed by pre_pro methods in order of addition + + :param predict : Tensor, model output + :param truth : Tensor, truth from dataset + :param **kwargs : extra 
arguments, pass to pre_pro functions + for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens + ''' + for f in self.pre_pro: + if f is None: + continue + predict, truth = f(predict, truth, **kwargs) + + return self._loss(predict, truth) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 6b0398b5..26362cb9 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,27 +1,25 @@ +import itertools +import os import time -from datetime import timedelta -from datetime import datetime import warnings from collections import defaultdict -import os -import itertools -import shutil +from datetime import datetime +from datetime import timedelta -from tensorboardX import SummaryWriter import torch +from tensorboardX import SummaryWriter from fastNLP.core.batch import Batch -from fastNLP.core.loss import Loss -from fastNLP.core.metrics import Evaluator from fastNLP.core.optimizer import Optimizer from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester -from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _build_args +from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _syn_model_data from fastNLP.core.utils import get_func_signature + class Trainer(object): """Main Training Loop diff --git a/test/core/test_loss.py b/test/core/test_loss.py index d45d54e3..fdde4f0e 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -1,9 +1,10 @@ +import math import unittest -import fastNLP.core.loss as loss -import math import torch as tc -import pdb + +import fastNLP.core.losses as loss + class TestLoss(unittest.TestCase): From 3d91f2f024207c8bfc0dae62cdaead227f4558c7 Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 1 Dec 2018 15:00:06 +0800 Subject: [PATCH 05/67] =?UTF-8?q?trainer=E8=BF=AD=E4=BB=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/tester.py | 18 ++++--- fastNLP/core/trainer.py | 117 +++++++++++++++++++++++++++------------- fastNLP/core/utils.py | 63 ++++++++++++++++++++-- 3 files changed, 148 insertions(+), 50 deletions(-) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index ee1354fe..5d264b80 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -6,33 +6,34 @@ from fastNLP.core.batch import Batch from fastNLP.core.sampler import RandomSampler from fastNLP.core.utils import _build_args +from fastNLP.core.utils import get_func_signature class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. 
""" - def __init__(self, data, model, batch_size=16, use_cuda=False): + def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose=0): super(Tester, self).__init__() self.use_cuda = use_cuda self.data = data self.batch_size = batch_size + self.verbose = verbose if torch.cuda.is_available() and self.use_cuda: self._model = model.cuda() else: self._model = model if hasattr(self._model, 'predict'): - assert callable(self._model.predict) + if not callable(self._model.predict): + raise TypeError(f"{get_func_signature(model.predict)} must be callable to be used " + f"for evaluation.") self._predict_func = self._model.predict else: self._predict_func = self._model - assert hasattr(model, 'evaluate') - self._evaluator = model.evaluate - self.eval_history = [] # evaluation results of all batches + def test(self): # turn on the testing mode; clean up the history network = self._model self.mode(network, is_test=True) - self.eval_history.clear() output, truths = defaultdict(list), defaultdict(list) data_iterator = Batch(self.data, self.batch_size, sampler=RandomSampler(), as_numpy=False) @@ -48,9 +49,10 @@ def test(self): output[k] = itertools.chain(*v) for k, v in truths.items(): truths[k] = itertools.chain(*v) - args = _build_args(self._evaluator, **output, **truths) + # args = _build_args(self._evaluator, **output, **truths) eval_results = self._evaluator(**args) - print("[tester] {}".format(self.print_eval_results(eval_results))) + if self.verbose >= 0: + print("[tester] {}".format(self.print_eval_results(eval_results))) self.mode(network, is_test=False) return eval_results diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 6b0398b5..63eb963e 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -9,6 +9,7 @@ from tensorboardX import SummaryWriter import torch +from torch import nn from fastNLP.core.batch import Batch from fastNLP.core.loss import Loss @@ -21,12 +22,13 @@ from fastNLP.core.utils import _build_args from fastNLP.core.utils import _syn_model_data from fastNLP.core.utils import get_func_signature +from fastNLP.core.dataset import DataSet class Trainer(object): """Main Training Loop """ - def __init__(self, train_data, model, n_epochs=3, batch_size=32, print_every=-1, validate_every=-1, + def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, **kwargs): @@ -35,6 +37,8 @@ def __init__(self, train_data, model, n_epochs=3, batch_size=32, print_every=-1, self.train_data = train_data self.dev_data = dev_data # If None, No validation. 
self.model = model + self.losser = losser + self.metrics = metrics self.n_epochs = int(n_epochs) self.batch_size = int(batch_size) self.use_cuda = bool(use_cuda) @@ -43,23 +47,22 @@ def __init__(self, train_data, model, n_epochs=3, batch_size=32, print_every=-1, self.validate_every = int(validate_every) self._best_accuracy = 0 - if need_check_code: - _check_code(dataset=train_data, model=model, dev_data=dev_data) - model_name = model.__class__.__name__ - assert hasattr(self.model, 'get_loss'), "model {} has to have a 'get_loss' function.".format(model_name) - self.loss_func = self.model.get_loss + # TODO check loss与metrics的类型 + + + + # TODO self._best_accuracy不能表现出当前的metric多种的情况 + if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer else: self.optimizer = optimizer.construct_from_pytorch(self.model.parameters()) - assert hasattr(self.model, 'evaluate'), "model {} has to have a 'evaluate' function.".format(model_name) - self.evaluator = self.model.evaluate - if self.dev_data is not None: self.tester = Tester(model=self.model, data=self.dev_data, + metrics=self.metrics, batch_size=self.batch_size, use_cuda=self.use_cuda) @@ -71,6 +74,38 @@ def __init__(self, train_data, model, n_epochs=3, batch_size=32, print_every=-1, # print(self.__dict__) + def _check_params(self, train_data, model, losser, metrics=[], n_epochs=3, batch_size=32, print_every=-1, + validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", + optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, + **kwargs): + if not isinstance(train_data, DataSet): + raise TypeError("The type of train_data must be fastNLP.DataSet, got {}.".\ + format(type(train_data))) + if not isinstance(model, nn.Module): + raise TypeError("The type of model must be torch.nn.Module, got {}.".\ + format(type(model))) + if losser is not None: + # TODO change + if not isinstance(losser, None): + raise TypeError("The type of losser must be xxx, got {}.".\ + format(type(losser))) + + # check metrics and dev_data + if (not metrics) and dev_data is not None: + raise ValueError("No metric for dev_data evaluation.") + if metrics and (dev_data is None): + raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") + + # check loss + if isinstance(losser, type): + self.losser = losser() + if not isinstance(self.losser, None): + raise TypeError(f'The type of losser must be `{}`, got {type(self.losser)}.') + + if need_check_code: + _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data) + + def train(self): """Start Training. 
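# A minimal sketch (not taken from this patch) of the forward contract the refactored
# Trainer relies on: model.forward must return a dict, enforced in data_forward below,
# and the dict keys are later matched by name against the loss/metric signatures. The
# toy model, the key name "pred", and the tensor shapes are assumptions for illustration.
import torch
from torch import nn

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, word_seq):
        # returning a dict lets Trainer/Tester route outputs to losses and metrics by key
        return {"pred": self.fc(word_seq)}

model = ToyModel()
output = model(torch.randn(3, 4))
assert isinstance(output, dict) and output["pred"].size() == (3, 2)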
@@ -171,6 +206,9 @@ def update(self): def data_forward(self, network, x): x = _build_args(network.forward, **x) y = network(**x) + if not isinstance(y, dict): + + raise TypeError(f"The return value of {get_func_signature(network.forward)} should be dict, got {type(y)}.") return y def grad_backward(self, loss): @@ -231,11 +269,11 @@ def best_eval_result(self, metrics): WARNING_CHECK_LEVEL = 1 STRICT_CHECK_LEVEL = 2 -def _check_code(dataset, model, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, check_level=WARNING_CHECK_LEVEL): +def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, + dev_data=None, + check_level=WARNING_CHECK_LEVEL): # check get_loss 方法 model_name = model.__class__.__name__ - if not hasattr(model, 'get_loss'): - raise AttributeError("{} has to have a 'get_loss' function.".format(model_name)) batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_count, (batch_x, batch_y) in enumerate(batch): @@ -248,23 +286,26 @@ def _check_code(dataset, model, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=No refined_batch_x = _build_args(model.forward, **batch_x) output = model(**refined_batch_x) func_signature = get_func_signature(model.forward) - assert isinstance(output, dict), "The return value of {} should be dict.".format(func_signature) + if not isinstance(output, dict): + raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(output)}`.") # loss check - if batch_count == 0: - _check_loss_evaluate(prev_func=model.forward, func=model.get_loss, check_level=check_level, - output=output, batch_y=batch_y) - loss_input = _build_args(model.get_loss, **output, **batch_y) - loss = model.get_loss(**loss_input) + if isinstance(losser, type): # 这种情况,用户传的是losser.CE这种未初始化的loss + # 需要保证output与batch_y是无歧义的? + # (1) output和batch_y长度为1 + # (2) output和batch_y的key是和losser接受的完全一致 + pass + + loss = losser(output, batch_y) # check loss output if batch_count == 0: if not isinstance(loss, torch.Tensor): - raise ValueError("The return value of {}.get_loss() should be torch.Tensor, but {} got.". - format(model_name, type(loss))) + raise ValueError("The return value of {} should be torch.Tensor, but got {}.". + format(type(losser), type(loss))) if len(loss.size())!=0: - raise ValueError("The size of return value of {}.get_loss() is {}, should be torch.size([])".format( - model_name, loss.size() + raise ValueError("The size of return value of {} is {}, should be torch.size([])".format( + type(losser), loss.size() )) loss.backward() model.zero_grad() @@ -272,26 +313,29 @@ def _check_code(dataset, model, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=No break if dev_data is not None: - if not hasattr(model, 'evaluate'): - raise AttributeError("{} has to have a 'evaluate' function to do evaluation. Or set" - "dev_data to 'None'." 
- .format(model_name)) outputs, truths = defaultdict(list), defaultdict(list) dev_batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) + # TODO 这里修改为使用tester + + with torch.no_grad(): for batch_count, (batch_x, batch_y) in enumerate(dev_batch): _syn_model_data(model, batch_x, batch_y) if hasattr(model, 'predict'): + if not callable(model.predict): + raise TypeError(f"{get_func_signature(model.predict)} must be callable to be used " + f"for evaluation.") refined_batch_x = _build_args(model.predict, **batch_x) prev_func = model.predict output = prev_func(**refined_batch_x) - func_signature = get_func_signature(model.predict) - assert isinstance(output, dict), "The return value of {} should be dict.".format(func_signature) else: refined_batch_x = _build_args(model.forward, **batch_x) prev_func = model.forward output = prev_func(**refined_batch_x) + func_signature = get_func_signature(prev_func) + if not isinstance(output, dict): + raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(output)}`") for k, v in output.items(): outputs[k].append(v) for k, v in batch_y.items(): @@ -299,16 +343,15 @@ def _check_code(dataset, model, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=No if batch_count+1>DEFAULT_CHECK_NUM_BATCH: break for k, v in outputs.items(): - outputs[k] = itertools.chain(*v) + outputs[k] = tuple(itertools.chain(*v)) for k, v in truths.items(): - truths[k] = itertools.chain(*v) - _check_loss_evaluate(prev_func=prev_func, func=model.evaluate, check_level=check_level, - output=outputs, batch_y=truths) - refined_input = _build_args(model.evaluate, **outputs, **truths) - metrics = model.evaluate(**refined_input) - func_signature = get_func_signature(model.evaluate) - assert isinstance(metrics, dict), "The return value of {} should be dict.". \ - format(func_signature) + truths[k] = tuple(itertools.chain(*v)) + #TODO 这里需要根据新版的metrics做修改,另外这里需要捕获来自metric的报错,因为需要指导用户debug + + + + + def _check_forward_error(model_func, check_level, batch_x): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 84faaece..8ffcc7bb 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -3,6 +3,7 @@ import os from collections import Counter from collections import namedtuple +import torch CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed'], verbose=False) @@ -95,7 +96,24 @@ def _check_arg_dict_list(func, args): all_needed=list(all_args)) def get_func_signature(func): - # can only be used in function or class method + """ + + Given a function or method, return its signature. + For example: + (1) function + def func(a, b='a', *args): + xxxx + get_func_signature(func) # 'func(a, b='a', *args)' + (2) method + class Demo: + def __init__(self): + xxx + def forward(self, a, b='a', **args) + demo = Demo() + get_func_signature(demo.forward) # 'Demo.forward(self, a, b='a', **args)' + :param func: a function or a method + :return: str or None + """ if inspect.ismethod(func): class_name = func.__self__.__class__.__name__ signature = inspect.signature(func) @@ -113,10 +131,16 @@ def get_func_signature(func): return signature_str -# move data to model's device -import torch def _syn_model_data(model, *args): - assert len(model.state_dict())!=0, "This model has no parameter." + """ + + move data to model's device, element in *args should be dict. This is a inplace change. 
+ :param model: + :param args: + :return: + """ + if len(model.state_dict())==0: + raise ValueError("model has no parameter.") device = model.parameters().__next__().device for arg in args: if isinstance(arg, dict): @@ -124,4 +148,33 @@ def _syn_model_data(model, *args): if isinstance(value, torch.Tensor): arg[key] = value.to(device) else: - raise ValueError("Only support dict type right now.") \ No newline at end of file + raise TypeError("Only support `dict` type right now.") + +def _prepare_metrics(metrics): + """ + + Prepare list of Metric based on input + :param metrics: + :return: + """ + _metrics = [] + if metrics: + if isinstance(metrics, list): + for metric in metrics: + if isinstance(metric, type): + metric = metric() + if isinstance(metric, None): + _metrics.append(metric) + else: + raise TypeError("The type of metric in metrics must be xxxx, not {}.".format( + type(), type(metric) + )) + elif isinstance(metrics, None): + _metrics = [metrics] + else: + raise TypeError("The type of metrics should be `list[xxx]` or `xxx`, got {}.".format( + type(metrics) + )) + + return _metrics + From 6427e85e8f7540cf60203dab16a0a4f04ce9b5ef Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 15:44:52 +0800 Subject: [PATCH 06/67] =?UTF-8?q?=E5=8D=87=E7=BA=A7Vocab=EF=BC=9A=20*=20?= =?UTF-8?q?=E5=A2=9E=E9=87=8F=E6=B7=BB=E5=8A=A0=E5=8D=95=E8=AF=8D=E5=88=B0?= =?UTF-8?q?=E8=AF=8D=E5=85=B8=E4=B8=AD=20*=20lazy=20update:=20=E5=BD=93?= =?UTF-8?q?=E7=94=A8=E5=88=B0=E8=AF=8D=E5=85=B8=E7=9A=84=E6=97=B6=E5=80=99?= =?UTF-8?q?=E6=89=8D=E9=87=8D=E6=96=B0build=20*=20=E5=BD=93=E6=96=B0?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=9A=84=E8=AF=8D=E5=AF=BC=E8=87=B4=E8=AF=8D?= =?UTF-8?q?=E5=85=B8=E5=A4=A7=E5=B0=8F=E8=B6=85=E5=87=BA=E9=99=90=E5=88=B6?= =?UTF-8?q?=E6=97=B6=EF=BC=8C=E6=89=93=E5=8D=B0=E4=B8=80=E4=B8=AAwarning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update Vocabulary: * More words can be added after the building. * Lazy update: rebuild automatically when vocab is used. * print warning when max size is reached --- fastNLP/core/vocabulary.py | 30 ++++++++++++++++++++++++++++-- test/core/test_vocabulary.py | 27 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 7b0ab614..ca6b4ebf 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -16,14 +16,35 @@ def isiterable(p_object): def check_build_vocab(func): + """A decorator to make sure the indexing is built before used. + + """ + def _wrapper(self, *args, **kwargs): - if self.word2idx is None: + if self.word2idx is None or self.rebuild is True: self.build_vocab() return func(self, *args, **kwargs) return _wrapper +def check_build_status(func): + """A decorator to check whether the vocabulary updates after the last build. + + """ + + def _wrapper(self, *args, **kwargs): + if self.rebuild is False: + self.rebuild = True + if self.max_size is not None and len(self.word_count) >= self.max_size: + print("[Warning] Vocabulary has reached the max size {} when calling {} method. " + "Adding more words may cause unexpected behaviour of Vocabulary. 
".format( + self.max_size, func.__name__)) + return func(self, *args, **kwargs) + + return _wrapper + + class Vocabulary(object): """Use for word and index one to one mapping @@ -54,7 +75,9 @@ def __init__(self, need_default=True, max_size=None, min_freq=None): self.unknown_label = None self.word2idx = None self.idx2word = None + self.rebuild = True + @check_build_status def update(self, word_lst): """Add a list of words into the vocabulary. @@ -62,6 +85,7 @@ def update(self, word_lst): """ self.word_count.update(word_lst) + @check_build_status def add(self, word): """Add a single word into the vocabulary. @@ -69,6 +93,7 @@ def add(self, word): """ self.word_count[word] += 1 + @check_build_status def add_word(self, word): """Add a single word into the vocabulary. @@ -76,6 +101,7 @@ def add_word(self, word): """ self.add(word) + @check_build_status def add_word_lst(self, word_lst): """Add a list of words into the vocabulary. @@ -101,6 +127,7 @@ def build_vocab(self): start_idx = len(self.word2idx) self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) self.build_reverse_vocab() + self.rebuild = False def build_reverse_vocab(self): """Build 'index to word' dict based on 'word to index' dict. @@ -188,4 +215,3 @@ def __setstate__(self, state): """ self.__dict__.update(state) self.build_reverse_vocab() - diff --git a/test/core/test_vocabulary.py b/test/core/test_vocabulary.py index e140b1aa..e453e935 100644 --- a/test/core/test_vocabulary.py +++ b/test/core/test_vocabulary.py @@ -59,3 +59,30 @@ def test_to_word(self): vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) vocab.update(text) self.assertEqual(text, [vocab.to_word(idx) for idx in [vocab[w] for w in text]]) + + +class TestOther(unittest.TestCase): + def test_additional_update(self): + vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab.update(text) + + _ = vocab["well"] + self.assertEqual(vocab.rebuild, False) + + vocab.add("hahaha") + self.assertEqual(vocab.rebuild, True) + + _ = vocab["hahaha"] + self.assertEqual(vocab.rebuild, False) + self.assertTrue("hahaha" in vocab) + + def test_warning(self): + vocab = Vocabulary(need_default=True, max_size=len(set(text)), min_freq=None) + vocab.update(text) + self.assertEqual(vocab.rebuild, True) + print(len(vocab)) + self.assertEqual(vocab.rebuild, False) + + vocab.update(["hahahha", "hhh", "vvvv", "ass", "asss", "jfweiong", "eqgfeg", "feqfw"]) + # this will print a warning + self.assertEqual(vocab.rebuild, True) From 3120cdd09a8f83378b59fd7e4f71da16ba4f7b12 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 17:23:25 +0800 Subject: [PATCH 07/67] =?UTF-8?q?=E6=9B=B4=E6=96=B0embed=5Floader:=20*=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0fast=5Fload=5Fembedding=E6=96=B9=E6=B3=95?= =?UTF-8?q?=EF=BC=8C=E7=94=A8vocab=E7=9A=84=E8=AF=8D=E7=B4=A2=E5=BC=95pre-?= =?UTF-8?q?trained=E4=B8=AD=E7=9A=84embedding=20*=20=E5=A6=82=E6=9E=9Cvoca?= =?UTF-8?q?b=E6=9C=89=E8=AF=8D=E6=B2=A1=E5=87=BA=E7=8E=B0=E5=9C=A8pre-trai?= =?UTF-8?q?n=E4=B8=AD=EF=BC=8C=E4=BB=8E=E5=B7=B2=E6=9C=89embedding?= =?UTF-8?q?=E4=B8=AD=E6=AD=A3=E6=80=81=E9=87=87=E6=A0=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update embed_loader: * add fast_load_embedding method, to index pre-trained embedding with words in Vocab * If words in Vocab are not exist in pre-trained, sample them from normal distribution computed by current embeddings --- fastNLP/io/embed_loader.py | 77 ++++++++++++++++------- 
test/data_for_tests/glove.6B.50d_test.txt | 2 - test/io/test_embed_loader.py | 12 ++++ 3 files changed, 66 insertions(+), 25 deletions(-) create mode 100644 test/io/test_embed_loader.py diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 878ea1b6..6e557c2b 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,3 +1,4 @@ +import numpy as np import torch from fastNLP.core.vocabulary import Vocabulary @@ -26,7 +27,7 @@ def _load_glove(emb_file): emb = {} with open(emb_file, 'r', encoding='utf-8') as f: for line in f: - line = list(filter(lambda w: len(w)>0, line.strip().split(' '))) + line = list(filter(lambda w: len(w) > 0, line.strip().split(' '))) if len(line) > 2: emb[line[0]] = torch.Tensor(list(map(float, line[1:]))) return emb @@ -35,9 +36,9 @@ def _load_glove(emb_file): def _load_pretrain(emb_file, emb_type): """Read txt data from embedding file and convert to np.array as pre-trained embedding - :param emb_file: str, the pre-trained embedding file path - :param emb_type: str, the pre-trained embedding data format - :return dict: {str: np.array} + :param str emb_file: the pre-trained embedding file path + :param str emb_type: the pre-trained embedding data format + :return dict embedding: `{str: np.array}` """ if emb_type == 'glove': return EmbedLoader._load_glove(emb_file) @@ -45,38 +46,68 @@ def _load_pretrain(emb_file, emb_type): raise Exception("embedding type {} not support yet".format(emb_type)) @staticmethod - def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl): + def load_embedding(emb_dim, emb_file, emb_type, vocab): """Load the pre-trained embedding and combine with the given dictionary. - :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding. - :param emb_file: str, the pre-trained embedding file path. - :param emb_type: str, the pre-trained embedding format, support glove now - :param vocab: Vocabulary, a mapping from word to index, can be provided by user or built from pre-trained embedding - :param emb_pkl: str, the embedding pickle file. + :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding. + :param str emb_file: the pre-trained embedding file path. + :param str emb_type: the pre-trained embedding format, support glove now + :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding :return embedding_tensor: Tensor of shape (len(word_dict), emb_dim) vocab: input vocab or vocab built by pre-train - TODO: fragile code + """ - # If the embedding pickle exists, load it and return. - # if os.path.exists(emb_pkl): - # with open(emb_pkl, "rb") as f: - # embedding_tensor, vocab = _pickle.load(f) - # return embedding_tensor, vocab - # Otherwise, load the pre-trained embedding. pretrain = EmbedLoader._load_pretrain(emb_file, emb_type) if vocab is None: # build vocabulary from pre-trained embedding vocab = Vocabulary() for w in pretrain.keys(): - vocab.update(w) + vocab.add(w) embedding_tensor = torch.randn(len(vocab), emb_dim) for w, v in pretrain.items(): if len(v.shape) > 1 or emb_dim != v.shape[0]: - raise ValueError('pretrian embedding dim is {}, dismatching required {}'.format(v.shape, (emb_dim,))) + raise ValueError( + "Pretrained embedding dim is {}. Dimension dismatched. 
Required {}".format(v.shape, (emb_dim,))) if vocab.has_word(w): embedding_tensor[vocab[w]] = v - - # save and return the result - # with open(emb_pkl, "wb") as f: - # _pickle.dump((embedding_tensor, vocab), f) return embedding_tensor, vocab + + @staticmethod + def parse_glove_line(line): + line = list(filter(lambda w: len(w) > 0, line.strip().split(" "))) + if len(line) <= 2: + raise RuntimeError("something goes wrong in parsing glove embedding") + return line[0], torch.Tensor(list(map(float, line[1:]))) + + @staticmethod + def fast_load_embedding(emb_dim, emb_file, vocab): + """Fast load the pre-trained embedding and combine with the given dictionary. + This loading method uses line-by-line operation. + + :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding. + :param str emb_file: the pre-trained embedding file path. + :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding + :return numpy.ndarray embedding_matrix: + + """ + if vocab is None: + raise RuntimeError("You must provide a vocabulary.") + embedding_matrix = np.zeros(shape=(len(vocab), emb_dim)) + hit_flags = np.zeros(shape=(len(vocab),), dtype=int) + with open(emb_file, "r", encoding="utf-8") as f: + for line in f: + word, vector = EmbedLoader.parse_glove_line(line) + if word in vocab: + if len(vector.shape) > 1 or emb_dim != vector.shape[0]: + raise ValueError("Pre-trained embedding dim is {}. Expect {}.".format(vector.shape, (emb_dim,))) + embedding_matrix[vocab[word]] = vector + hit_flags[vocab[word]] = 1 + + if np.sum(hit_flags) < len(vocab): + # some words from vocab are missing in pre-trained embedding + # we normally sample them + vocab_embed = embedding_matrix[np.where(hit_flags)] + mean, cov = vocab_embed.mean(axis=0), np.cov(vocab_embed.T) + sampled_vectors = np.random.multivariate_normal(mean, cov, size=(len(vocab) - np.sum(hit_flags),)) + embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors + return embedding_matrix diff --git a/test/data_for_tests/glove.6B.50d_test.txt b/test/data_for_tests/glove.6B.50d_test.txt index cd71b26e..8b443cca 100644 --- a/test/data_for_tests/glove.6B.50d_test.txt +++ b/test/data_for_tests/glove.6B.50d_test.txt @@ -8,5 +8,3 @@ in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796 " 0.25769 0.45629 -0.76974 -0.37679 0.59272 -0.063527 0.20545 -0.57385 -0.29009 -0.13662 0.32728 1.4719 -0.73681 -0.12036 0.71354 -0.46098 0.65248 0.48887 -0.51558 0.039951 -0.34307 -0.014087 0.86488 0.3546 0.7999 -1.4995 -1.8153 0.41128 0.23921 -0.43139 3.6623 -0.79834 -0.54538 0.16943 -0.82017 -0.3461 0.69495 -1.2256 -0.17992 -0.057474 0.030498 -0.39543 -0.38515 -1.0002 0.087599 -0.31009 -0.34677 -0.31438 0.75004 0.97065 's 0.23727 0.40478 -0.20547 0.58805 0.65533 0.32867 -0.81964 -0.23236 0.27428 0.24265 0.054992 0.16296 -1.2555 -0.086437 0.44536 0.096561 -0.16519 0.058378 -0.38598 0.086977 0.0033869 0.55095 -0.77697 -0.62096 0.092948 -2.5685 -0.67739 0.10151 -0.48643 -0.057805 3.1859 -0.017554 -0.16138 0.055486 -0.25885 -0.33938 -0.19928 0.26049 0.10478 -0.55934 
-0.12342 0.65961 -0.51802 -0.82995 -0.082739 0.28155 -0.423 -0.27378 -0.007901 -0.030231 - - diff --git a/test/io/test_embed_loader.py b/test/io/test_embed_loader.py new file mode 100644 index 00000000..0a7c4fcf --- /dev/null +++ b/test/io/test_embed_loader.py @@ -0,0 +1,12 @@ +import unittest + +from fastNLP.core.vocabulary import Vocabulary +from fastNLP.io.embed_loader import EmbedLoader + + +class TestEmbedLoader(unittest.TestCase): + def test_case(self): + vocab = Vocabulary() + vocab.update(["the", "in", "I", "to", "of", "hahaha"]) + embedding = EmbedLoader().fast_load_embedding(50, "../data_for_tests/glove.6B.50d_test.txt", vocab) + self.assertEqual(tuple(embedding.shape), (len(vocab), 50)) From d8a80ad6c6bddce0f9229db28ebc131e05cd7f6f Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 1 Dec 2018 17:28:47 +0800 Subject: [PATCH 08/67] update LossBase class --- fastNLP/core/losses.py | 66 ++++++++++++++++++++++++++++++++++++++---- fastNLP/core/utils.py | 33 +++++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 1e5a4914..39ba4012 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -1,20 +1,76 @@ import torch +from fastNLP.core.utils import _get_arg_list +from fastNLP.core.utils import _map_args +from fastNLP.core.utils import get_func_signature +from fastNLP.core.utils import _build_args + class LossBase(object): def __init__(self): + # key: name in target function; value: name in output function self.param_map = {} def get_loss(self, *args, **kwargs): raise NotImplementedError - def __call__(self, output_dict, predict_dict): - pass + def __call__(self, output_dict, target_dict): + """ + :param output_dict: A dict from forward function of the network. + :param target_dict: A dict from DataSet.batch_y. + :return: + """ + args, defaults, defaults_val, varargs, kwargs = _get_arg_list(self.get_loss) + if varargs is not None: + raise RuntimeError( + f"The function {get_func_signature(self.get_loss)} should not use Positional Argument." 
+ ) + + param_map = self.param_map + for keys in args: + if keys not in param_map: + param_map.update({keys: keys}) + for keys in defaults: + if keys not in param_map: + param_map.update({keys: keys}) + # param map: key= name in get_loss function, value= name in param dict + reversed_param_map = {val: key for key, val in param_map} + # reversed param map: key= name in param dict, value= name in get_loss function + + param_val_dict = {} + for keys, val in output_dict.items(): + if keys not in target_dict.keys(): + param_val_dict.update({keys: val}) + else: + raise RuntimeError("conflict Error in output dict and target dict with name {}".format(keys)) + for keys, val in target_dict.items(): + if keys not in output_dict.keys(): + param_val_dict.update({keys: val}) + else: + raise RuntimeError("conflict Error in output dict and target dict with name {}".format(keys)) + for keys in args: + if param_map[keys] not in param_val_dict.keys(): + raise RuntimeError("missing param {} in function {}".format(keys, self.get_loss)) -class Loss(LossBase): - def __init__(self): - pass + param_map_val = _map_args(reversed_param_map, **param_val_dict) + param_value = _build_args(**param_map_val) + + loss = self.get_loss(**param_value) + + if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): + if not isinstance(loss, torch.Tensor): + raise RuntimeError("loss ERROR: loss except a torch.Tensor but get {}".format(type(loss))) + raise RuntimeError("loss ERROR: len(loss.size()) except 0 but got {}".format(len(loss.size()))) + + return loss + + +class NewLoss(LossBase): + def __init__(self, func, key_map=None, **kwargs): + super(NewLoss).__init__() + if not callable(func): + raise RuntimeError("") def squash(predict, truth, **kwargs): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 84faaece..13982e27 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -64,6 +64,39 @@ def _build_args(func, **kwargs): return output +def _map_args(maps: dict, **kwargs): + # maps: key=old name, value= new name + output = {} + for name, val in kwargs.items(): + if name in maps: + assert isinstance(maps[name], str) + output.update({maps[name]: val}) + else: + output.update({name: val}) + for keys in maps.keys(): + if keys not in output.keys(): + # TODO: add UNUSED warning. 
+ pass + return output + + +def _get_arg_list(func): + assert callable(func) + spect = inspect.getfullargspec(func) + if spect.defaults is not None: + args = spect.args[: -len(spect.defaults)] + defaults = spect.args[-len(spect.defaults):] + defaults_val = spect.defaults + else: + args = spect.args + defaults = None + defaults_val = None + varargs = spect.varargs + kwargs = spect.varkw + return args, defaults, defaults_val, varargs, kwargs + + + # check args def _check_arg_dict_list(func, args): if isinstance(args, dict): From ad0a8c177554ee1a5c4656ea2c8a06aa369f0ca5 Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 1 Dec 2018 18:27:07 +0800 Subject: [PATCH 09/67] =?UTF-8?q?=E5=A2=9E=E5=8A=A0metric?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 23 +++++++ fastNLP/core/metrics.py | 129 +++++++++++++++++++++++++++++++++++++++- fastNLP/core/tester.py | 56 ++++++++++++----- fastNLP/core/trainer.py | 71 ++++++++++------------ fastNLP/core/utils.py | 53 +++++++---------- 5 files changed, 245 insertions(+), 87 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 1e5a4914..d818c613 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -17,6 +17,29 @@ def __init__(self): pass +class LossInForward(LossBase): + def __init__(self, loss_key='loss'): + super().__init__() + + self.loss_key = loss_key + + def get_loss(self, *args, **kwargs): + pass + + def __call__(self, output_dict, predict_dict): + pass + + +def _prepare_losser(losser): + if losser is None: + losser = LossInForward() + return losser + elif isinstance(losser, LossBase): + return losser + else: + raise TypeError(f"Type of losser should be `fastNLP.LossBase`, got {type(losser)}") + + def squash(predict, truth, **kwargs): '''To reshape tensors in order to fit Loss functions in pytorch diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 94893324..d4d81212 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -1,8 +1,136 @@ + import warnings +import inspect import numpy as np import torch +from fastNLP.core.utils import get_func_signature +from fastNLP.core.utils import _check_arg_dict_list +from fastNLP.core.utils import _build_args + +class MetricBase(object): + def __init__(self): + self.param_map = {} # key is param in function, value is input param. + self._checked = False + + def evaluate(self, *args, **kwargs): + raise NotImplementedError + + def _init_param_map(self, key_map, **kwargs): + self.param_map = {} + for key, value in key_map.items(): + if isinstance(key, str): + raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.") + if isinstance(value, str): + raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.") + self.param_map[key] = value + for key, value in kwargs.items(): + if isinstance(value, str): + raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") + self.param_map[key] = value + + def __call__(self, output_dict, target_dict, force_check=False): + """ + :param output_dict: + :param target_dict: + :return: + """ + if not callable(self.evaluate): + raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") + + if not self._checked: + # 1. 
check consistence between signature and param_map + func_spect = inspect.getfullargspec(self.evaluate) + func_args = func_spect.args + for func_param, input_param in self.param_map.items(): + if func_param not in func_args: + raise NameError(f"{func_param} not in {get_func_signature(self.evaluate)}.") + # 2. only part of the param_map are passed, left are not + for arg in func_args: + if arg not in self.param_map: + self.param_map[arg] = arg #This param does not need mapping. + self._evaluate_args = func_args + + # need to wrap inputs in dict. + mapped_output_dict = {} + mapped_target_dict = {} + for func_arg in self._evaluate_args: + input_arg = self.param_map[func_arg] + if input_arg in output_dict: + mapped_output_dict[func_arg] = output_dict[input_arg] + if input_arg in target_dict: + mapped_target_dict[func_arg] = target_dict[input_arg] + + # check duplicated, unused, missing + if force_check or not self._checked: + check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_output_dict]) + self._reverse_param_map = {value:key for key, value in check_res.items()} + for key, value in check_res.items(): + new_value = value.copy() + for idx, func_param in enumerate(value): + if func_param in self._reverse_param_map: + new_value[idx] = self._reverse_param_map[func_param] + if check_res.missing or check_res.duplicated: + raise CheckError(check_res=check_res) + refined_args = _build_args(self.evaluate, **mapped_output_dict, **mapped_target_dict) + + metrics = self.evaluate(**refined_args) + + if not isinstance(metrics, dict): + raise TypeError(f"The return value of {get_func_signature(self.evaluate)} must be `dict`, " + f"got {type(metrics)}.") + self._checked = True + + return metrics + + + + + +class CheckError(Exception): + def __init__(self, check_res): + + err = '' + if check_res.missing: + err += f'Missing: {check_res.missing}\n' + if check_res.duplicated: + err += f'Duplicated: {check_res.duplicated}\n' + self.check_res = check_res + + def __str__(self): + pass + + +class Metric(MetricBase): + def __init__(self, func, key_map, **kwargs): + super().__init__() + pass + +def _prepare_metrics(metrics): + """ + + Prepare list of Metric based on input + :param metrics: + :return: List[fastNLP.MetricBase] + """ + _metrics = [] + if metrics: + if isinstance(metrics, list): + for metric in metrics: + if isinstance(metric, type): + metric = metric() + if isinstance(metric, MetricBase): + _metrics.append(metric) + else: + raise TypeError(f"The type of metric in metrics must be `fastNLP.MetricBase`, not `{type(metric)}`.") + elif isinstance(metrics, MetricBase): + _metrics = [metrics] + else: + raise TypeError("The type of metrics should be `list[fastNLP.MetricBase]` or `fastNLP.MetricBase`, got {}." 
+ .format(type(metrics))) + return _metrics + class Evaluator(object): def __init__(self): @@ -17,7 +145,6 @@ def __call__(self, predict, truth): """ raise NotImplementedError - class ClassifyEvaluator(Evaluator): def __init__(self): super(ClassifyEvaluator, self).__init__() diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 5d264b80..a66ce234 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -2,32 +2,49 @@ from collections import defaultdict import torch +from torch import nn from fastNLP.core.batch import Batch from fastNLP.core.sampler import RandomSampler +from fastNLP.core.dataset import DataSet from fastNLP.core.utils import _build_args from fastNLP.core.utils import get_func_signature +from fastNLP.core.utils import _move_dict_value_to_device +from fastNLP.core.metrics import _prepare_metrics class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose=0): super(Tester, self).__init__() - self.use_cuda = use_cuda + + if not isinstance(data, DataSet): + raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.") + if not isinstance(model, nn.Module): + raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.") + + self.metrics = _prepare_metrics(metrics) + + # check predict + if hasattr(self._model, 'predict'): + self._predict_func = self._model.predict + if not callable(self._predict_func): + _model_name = model.__class__.__name__ + raise TypeError(f"`{_model_name}.predict` must be callable to be used " + f"for evaluation, not `{type(self._predict_func)}`.") + else: + self._predict_func = self._model + self.data = data - self.batch_size = batch_size - self.verbose = verbose if torch.cuda.is_available() and self.use_cuda: self._model = model.cuda() else: self._model = model - if hasattr(self._model, 'predict'): - if not callable(self._model.predict): - raise TypeError(f"{get_func_signature(model.predict)} must be callable to be used " - f"for evaluation.") - self._predict_func = self._model.predict - else: - self._predict_func = self._model + self.use_cuda = use_cuda + self.batch_size = batch_size + self.verbose = verbose + + self._model_device = model.parameters().__next__().device def test(self): @@ -39,6 +56,7 @@ def test(self): with torch.no_grad(): for batch_x, batch_y in data_iterator: + _move_dict_value_to_device(self._model_device, batch_x, batch_y) prediction = self.data_forward(network, batch_x) assert isinstance(prediction, dict) for k, v in prediction.items(): @@ -49,10 +67,13 @@ def test(self): output[k] = itertools.chain(*v) for k, v in truths.items(): truths[k] = itertools.chain(*v) - # args = _build_args(self._evaluator, **output, **truths) - eval_results = self._evaluator(**args) + eval_results = {} + for metric in self.metrics: + eval_result = metric(output, truths) + metric_name = metric.__class__.__name__ + eval_results[metric_name] = eval_result if self.verbose >= 0: - print("[tester] {}".format(self.print_eval_results(eval_results))) + print("[tester] \n{}".format(self.format_eval_results(eval_results))) self.mode(network, is_test=False) return eval_results @@ -74,10 +95,15 @@ def data_forward(self, network, x): y = self._predict_func(**x) return y - def print_eval_results(self, results): + def format_eval_results(self, results): """Override this method to support more print formats. 
:param results: dict, (str: float) is (metrics name: value) """ - return ", ".join([str(key) + "=" + str(value) for key, value in results.items()]) + _str = '' + for metric_name, metric_result in results.items(): + _str += metric_name + '\n\t' + _str += ", ".join([str(key) + "=" + str(value) for key, value in results.items()]) + _str += '\n' + return _str diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 4febdfce..97b420c5 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -17,10 +17,15 @@ from fastNLP.core.tester import Tester from fastNLP.core.utils import _build_args from fastNLP.core.utils import _check_arg_dict_list -from fastNLP.core.utils import _syn_model_data +from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature from fastNLP.core.dataset import DataSet +from fastNLP.core.losses import LossBase +from fastNLP.core.metrics import MetricBase +from fastNLP.core.losses import _prepare_losser +from fastNLP.core.metrics import _prepare_metrics + class Trainer(object): """Main Training Loop @@ -32,6 +37,25 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat **kwargs): super(Trainer, self).__init__() + if not isinstance(train_data, DataSet): + raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.") + if not isinstance(model, nn.Module): + raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") + + # check metrics and dev_data + if (not metrics) and dev_data is not None: + raise ValueError("No metric for dev_data evaluation.") + if metrics and (dev_data is None): + raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") + + # prepare evaluate + metrics = _prepare_metrics(metrics) + # prepare loss + losser = _prepare_losser(losser) + + if need_check_code: + _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data) + self.train_data = train_data self.dev_data = dev_data # If None, No validation. self.model = model @@ -45,10 +69,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.validate_every = int(validate_every) self._best_accuracy = 0 - - # TODO check loss与metrics的类型 - - + self._model_device = model.parameters().__next__().device # TODO self._best_accuracy不能表现出当前的metric多种的情况 @@ -72,38 +93,6 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # print(self.__dict__) - def _check_params(self, train_data, model, losser, metrics=[], n_epochs=3, batch_size=32, print_every=-1, - validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", - optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, - **kwargs): - if not isinstance(train_data, DataSet): - raise TypeError("The type of train_data must be fastNLP.DataSet, got {}.".\ - format(type(train_data))) - if not isinstance(model, nn.Module): - raise TypeError("The type of model must be torch.nn.Module, got {}.".\ - format(type(model))) - if losser is not None: - # TODO change - if not isinstance(losser, None): - raise TypeError("The type of losser must be xxx, got {}.".\ - format(type(losser))) - - # check metrics and dev_data - if (not metrics) and dev_data is not None: - raise ValueError("No metric for dev_data evaluation.") - if metrics and (dev_data is None): - raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. 
") - - # check loss - if isinstance(losser, type): - self.losser = losser() - if not isinstance(self.losser, None): - raise TypeError(f'The type of losser must be `{}`, got {type(self.losser)}.') - - if need_check_code: - _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data) - - def train(self): """Start Training. @@ -153,8 +142,9 @@ def _train_epoch(self, data_iterator, model, epoch, dev_data, start, **kwargs): - epoch: int, """ for batch_x, batch_y in data_iterator: + # TODO 这里可能会遇到问题,万一用户在model内部修改了prediction的device就会有问题 + _move_dict_value_to_device(self._model_device, batch_x, batch_y) prediction = self.data_forward(model, batch_x) - loss = self.get_loss(prediction, batch_y) self.grad_backward(loss) self.update() @@ -205,7 +195,6 @@ def data_forward(self, network, x): x = _build_args(network.forward, **x) y = network(**x) if not isinstance(y, dict): - raise TypeError(f"The return value of {get_func_signature(network.forward)} should be dict, got {type(y)}.") return y @@ -299,7 +288,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ # check loss output if batch_count == 0: if not isinstance(loss, torch.Tensor): - raise ValueError("The return value of {} should be torch.Tensor, but got {}.". + raise ValueError("The return value of {} should be `torch.Tensor`, but got `{}`.". format(type(losser), type(loss))) if len(loss.size())!=0: raise ValueError("The size of return value of {} is {}, should be torch.size([])".format( @@ -314,7 +303,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ outputs, truths = defaultdict(list), defaultdict(list) dev_batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) # TODO 这里修改为使用tester - + tester = Tester(data=dataset, model=model, metrics=metrics, batch_size=batch_size, ) with torch.no_grad(): for batch_count, (batch_x, batch_y) in enumerate(dev_batch): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 8ffcc7bb..97ed83d9 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -3,11 +3,9 @@ import os from collections import Counter from collections import namedtuple +from collections import defaultdict import torch -CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed'], verbose=False) - - def save_pickle(obj, pickle_path, file_name): """Save an object into a pickle file. @@ -89,11 +87,15 @@ def _check_arg_dict_list(func, args): input_args = set(input_arg_count.keys()) missing = list(require_args - input_args) unused = list(input_args - all_args) - return CheckRes(missing=missing, - unused=unused, - duplicated=duplicated, - required=list(require_args), - all_needed=list(all_args)) + + check_res = {} + check_res['missing'] = missing + check_res['unused'] = unused + check_res['duplicated'] = duplicated + check_res['required'] = list(require_args) + check_res['all_needed'] = list(all_args) + + return check_res def get_func_signature(func): """ @@ -150,31 +152,22 @@ def _syn_model_data(model, *args): else: raise TypeError("Only support `dict` type right now.") -def _prepare_metrics(metrics): +def _move_dict_value_to_device(device, *args): """ - Prepare list of Metric based on input - :param metrics: + move data to model's device, element in *args should be dict. This is a inplace change. 
+ :param device: torch.device + :param args: :return: """ - _metrics = [] - if metrics: - if isinstance(metrics, list): - for metric in metrics: - if isinstance(metric, type): - metric = metric() - if isinstance(metric, None): - _metrics.append(metric) - else: - raise TypeError("The type of metric in metrics must be xxxx, not {}.".format( - type(), type(metric) - )) - elif isinstance(metrics, None): - _metrics = [metrics] - else: - raise TypeError("The type of metrics should be `list[xxx]` or `xxx`, got {}.".format( - type(metrics) - )) + if not isinstance(device, torch.device): + raise TypeError(f"device must be `torch.device`, got `{type(device)}`") - return _metrics + for arg in args: + if isinstance(arg, dict): + for key, value in arg.items(): + if isinstance(value, torch.Tensor): + arg[key] = value.to(device) + else: + raise TypeError("Only support `dict` type right now.") From 37e282d3243405d4289ad87432bba7ed81dc6d1f Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 1 Dec 2018 18:31:16 +0800 Subject: [PATCH 10/67] update LossBase class --- fastNLP/core/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 39ba4012..760222f7 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -51,7 +51,7 @@ def __call__(self, output_dict, target_dict): for keys in args: if param_map[keys] not in param_val_dict.keys(): - raise RuntimeError("missing param {} in function {}".format(keys, self.get_loss)) + raise RuntimeError(f"missing param {keys} in function {get_func_signature(self.get_loss)}") param_map_val = _map_args(reversed_param_map, **param_val_dict) param_value = _build_args(**param_map_val) From 2c8bd9575a8d08116e7bc0aad33ef8dd540703bb Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 1 Dec 2018 19:55:23 +0800 Subject: [PATCH 11/67] add _method_function --- fastNLP/core/utils.py | 44 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 82e3d07c..efc2ef7e 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -3,9 +3,9 @@ import os from collections import Counter from collections import namedtuple -from collections import defaultdict import torch +CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed'], verbose=False) def save_pickle(obj, pickle_path, file_name): """Save an object into a pickle file. 
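# A small illustration (not part of the patch) of how the restored CheckRes namedtuple is
# meant to be consumed by the argument-checking helpers such as _check_arg_dict_list below:
# callers read the named fields rather than positional indices. The field values here are
# invented sample data.
from collections import namedtuple

CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed'])

check_res = CheckRes(missing=['target'], unused=['seq_len'], duplicated=[],
                     required=['pred', 'target'], all_needed=['pred', 'target'])
if check_res.missing or check_res.duplicated:
    # this is the condition under which a CheckError is raised elsewhere in this patch set
    print("argument check failed:", check_res.missing, check_res.duplicated)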
@@ -121,14 +121,11 @@ def _check_arg_dict_list(func, args): missing = list(require_args - input_args) unused = list(input_args - all_args) - check_res = {} - check_res['missing'] = missing - check_res['unused'] = unused - check_res['duplicated'] = duplicated - check_res['required'] = list(require_args) - check_res['all_needed'] = list(all_args) - - return check_res + return CheckRes(missing=missing, + unused=unused, + duplicated=duplicated, + required=list(require_args), + all_needed=list(all_args)) def get_func_signature(func): """ @@ -165,6 +162,19 @@ def forward(self, a, b='a', **args) signature_str = func.__name__ + signature_str return signature_str +def _is_function_or_method(func): + """ + + :param func: + :return: + """ + if not inspect.ismethod(func) and not inspect.isfunction(func): + return False + return True + +def _check_function_or_method(func): + if not _is_function_or_method(func): + raise TypeError(f"{type(func)} is not a method or function.") def _syn_model_data(model, *args): """ @@ -204,3 +214,19 @@ def _move_dict_value_to_device(device, *args): else: raise TypeError("Only support `dict` type right now.") + +class CheckError(Exception): + """ + + CheckError. Used in losses.LossBase, metrics.MetricBase. + """ + def __init__(self, check_res): + err = '' + if check_res['missing']: + err += f"Missing: {check_res['missing']}\n" + if check_res['duplicated']: + err += f"Duplicated: {check_res['duplicated']}\n" + if check_res['unused']: + err += f"Unused: {check_res['unused']}\n" + Exception.__init__(self, err) + self.check_res = check_res From 0d4720b1d91648fa61683d9dde13d9e183b9c003 Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 1 Dec 2018 20:14:43 +0800 Subject: [PATCH 12/67] CheckError add function --- fastNLP/core/metrics.py | 28 ++++--------- fastNLP/core/tester.py | 30 ++++++++------ fastNLP/core/trainer.py | 87 +++++++++++------------------------------ fastNLP/core/utils.py | 17 ++++---- 4 files changed, 57 insertions(+), 105 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index d4d81212..60e0d82f 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -8,6 +8,8 @@ from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _build_args +from fastNLP.core.utils import CheckError + class MetricBase(object): def __init__(self): @@ -29,7 +31,7 @@ def _init_param_map(self, key_map, **kwargs): if isinstance(value, str): raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") self.param_map[key] = value - + def __call__(self, output_dict, target_dict, force_check=False): """ :param output_dict: @@ -67,7 +69,7 @@ def __call__(self, output_dict, target_dict, force_check=False): check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_output_dict]) self._reverse_param_map = {value:key for key, value in check_res.items()} for key, value in check_res.items(): - new_value = value.copy() + new_value = list(value) for idx, func_param in enumerate(value): if func_param in self._reverse_param_map: new_value[idx] = self._reverse_param_map[func_param] @@ -85,28 +87,12 @@ def __call__(self, output_dict, target_dict, force_check=False): return metrics - - - -class CheckError(Exception): - def __init__(self, check_res): - - err = '' - if check_res.missing: - err += f'Missing: {check_res.missing}\n' - if check_res.duplicated: - err += f'Duplicated: {check_res.duplicated}\n' - self.check_res = check_res - - def __str__(self): - pass - - 
class Metric(MetricBase): def __init__(self, func, key_map, **kwargs): super().__init__() pass + def _prepare_metrics(metrics): """ @@ -127,8 +113,8 @@ def _prepare_metrics(metrics): elif isinstance(metrics, MetricBase): _metrics = [metrics] else: - raise TypeError("The type of metrics should be `list[fastNLP.MetricBase]` or `fastNLP.MetricBase`, got {}." - .format(type(metrics))) + raise TypeError(f"The type of metrics should be `list[fastNLP.MetricBase]` or `fastNLP.MetricBase`, " + f"got {type(metrics)}.") return _metrics diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index a66ce234..33d8cc81 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -5,12 +5,13 @@ from torch import nn from fastNLP.core.batch import Batch -from fastNLP.core.sampler import RandomSampler +from fastNLP.core.sampler import SequentialSampler from fastNLP.core.dataset import DataSet from fastNLP.core.utils import _build_args from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.metrics import _prepare_metrics +from fastNLP.core.utils import CheckError class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ @@ -33,7 +34,7 @@ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose= raise TypeError(f"`{_model_name}.predict` must be callable to be used " f"for evaluation, not `{type(self._predict_func)}`.") else: - self._predict_func = self._model + self._predict_func = self._model.forward self.data = data if torch.cuda.is_available() and self.use_cuda: @@ -50,14 +51,14 @@ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose= def test(self): # turn on the testing mode; clean up the history network = self._model - self.mode(network, is_test=True) + self._mode(network, is_test=True) output, truths = defaultdict(list), defaultdict(list) - data_iterator = Batch(self.data, self.batch_size, sampler=RandomSampler(), as_numpy=False) + data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False) with torch.no_grad(): for batch_x, batch_y in data_iterator: _move_dict_value_to_device(self._model_device, batch_x, batch_y) - prediction = self.data_forward(network, batch_x) + prediction = self._data_forward(self._predict_func, batch_x) assert isinstance(prediction, dict) for k, v in prediction.items(): output[k].append(v) @@ -68,16 +69,21 @@ def test(self): for k, v in truths.items(): truths[k] = itertools.chain(*v) eval_results = {} + try: for metric in self.metrics: eval_result = metric(output, truths) metric_name = metric.__class__.__name__ eval_results[metric_name] = eval_result + except CheckError as e: + pass + + if self.verbose >= 0: - print("[tester] \n{}".format(self.format_eval_results(eval_results))) - self.mode(network, is_test=False) + print("[tester] \n{}".format(self._format_eval_results(eval_results))) + self._mode(network, is_test=False) return eval_results - def mode(self, model, is_test=False): + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. :param model: a PyTorch model @@ -89,13 +95,13 @@ def mode(self, model, is_test=False): else: model.train() - def data_forward(self, network, x): + def _data_forward(self, func, x): """A forward pass of the model. 
""" - x = _build_args(network.forward, **x) - y = self._predict_func(**x) + x = _build_args(func, **x) + y = func(**x) return y - def format_eval_results(self, results): + def _format_eval_results(self, results): """Override this method to support more print formats. :param results: dict, (str: float) is (metrics name: value) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 97b420c5..da8e54f9 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -25,7 +25,7 @@ from fastNLP.core.metrics import MetricBase from fastNLP.core.losses import _prepare_losser from fastNLP.core.metrics import _prepare_metrics - +from fastNLP.core.utils import CheckError class Trainer(object): """Main Training Loop @@ -211,13 +211,11 @@ def grad_backward(self, loss): def get_loss(self, predict, truth): """Compute loss given prediction and ground truth. - :param predict: prediction label vector - :param truth: ground truth label vector + :param predict: prediction dict, produced by model.forward + :param truth: ground truth dict, produced by batch_y :return: a scalar """ - assert isinstance(predict, dict) and isinstance(truth, dict) - args = _build_args(self.loss_func, **predict, **truth) - return self.loss_func(**args) + return self.losser(predict, truth) def save_model(self, model, model_name, only_param=False): model_name = os.path.join(self.save_path, model_name) @@ -260,11 +258,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ dev_data=None, check_level=WARNING_CHECK_LEVEL): # check get_loss 方法 - model_name = model.__class__.__name__ + model_devcie = model.parameters().__next__().device batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_count, (batch_x, batch_y) in enumerate(batch): - _syn_model_data(model, batch_x, batch_y) + _move_dict_value_to_device(model_devcie, batch_x, batch_y) # forward check if batch_count==0: _check_forward_error(model_func=model.forward, check_level=check_level, @@ -277,68 +275,29 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(output)}`.") # loss check - if isinstance(losser, type): # 这种情况,用户传的是losser.CE这种未初始化的loss - # 需要保证output与batch_y是无歧义的? - # (1) output和batch_y长度为1 - # (2) output和batch_y的key是和losser接受的完全一致 - pass - - loss = losser(output, batch_y) - + try: + loss = losser(output, batch_y) + except CheckError as e: + _check_loss_evaluate(prev_func=model.forward, func=e.func_signature, + check_res=e.check_res, output=output, batch_y=batch_y, + check_level=check_level) # check loss output if batch_count == 0: if not isinstance(loss, torch.Tensor): - raise ValueError("The return value of {} should be `torch.Tensor`, but got `{}`.". 
- format(type(losser), type(loss))) + raise TypeError(f"The return value of {get_func_signature(losser.__call__)} should be `torch.Tensor`, " + f"but got `{type(loss)}`.") if len(loss.size())!=0: - raise ValueError("The size of return value of {} is {}, should be torch.size([])".format( - type(losser), loss.size() - )) + raise ValueError(f"The size of return value of {get_func_signature(losser.__call__)} is {loss.size()}, " + f"should be torch.size([])") loss.backward() model.zero_grad() if batch_count+1>=DEFAULT_CHECK_NUM_BATCH: break if dev_data is not None: - outputs, truths = defaultdict(list), defaultdict(list) - dev_batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) - # TODO 这里修改为使用tester - tester = Tester(data=dataset, model=model, metrics=metrics, batch_size=batch_size, ) - - with torch.no_grad(): - for batch_count, (batch_x, batch_y) in enumerate(dev_batch): - _syn_model_data(model, batch_x, batch_y) - - if hasattr(model, 'predict'): - if not callable(model.predict): - raise TypeError(f"{get_func_signature(model.predict)} must be callable to be used " - f"for evaluation.") - refined_batch_x = _build_args(model.predict, **batch_x) - prev_func = model.predict - output = prev_func(**refined_batch_x) - else: - refined_batch_x = _build_args(model.forward, **batch_x) - prev_func = model.forward - output = prev_func(**refined_batch_x) - func_signature = get_func_signature(prev_func) - if not isinstance(output, dict): - raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(output)}`") - for k, v in output.items(): - outputs[k].append(v) - for k, v in batch_y.items(): - truths[k].append(v) - if batch_count+1>DEFAULT_CHECK_NUM_BATCH: - break - for k, v in outputs.items(): - outputs[k] = tuple(itertools.chain(*v)) - for k, v in truths.items(): - truths[k] = tuple(itertools.chain(*v)) - #TODO 这里需要根据新版的metrics做修改,另外这里需要捕获来自metric的报错,因为需要指导用户debug - - - - - + tester = Tester(data=dataset[:batch_size*DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, + batch_size=batch_size, verbose=-1) + tester.test() def _check_forward_error(model_func, check_level, batch_x): @@ -346,11 +305,11 @@ def _check_forward_error(model_func, check_level, batch_x): _missing = '' _unused = '' func_signature = get_func_signature(model_func) - if len(check_res.missing)!=0: + if len(check_res['missing'])!=0: _missing = "Function {} misses {}, only provided with {}, " \ ".\n".format(func_signature, check_res.missing, list(batch_x.keys())) - if len(check_res.unused)!=0: + if len(check_res['unused'])!=0: if len(check_res.unused) > 1: _unused = "{} are not used ".format(check_res.unused) else: @@ -370,9 +329,7 @@ def _check_forward_error(model_func, check_level, batch_x): elif check_level == WARNING_CHECK_LEVEL: warnings.warn(message=_unused) -def _check_loss_evaluate(prev_func, func, check_level, output, batch_y): - - check_res = _check_arg_dict_list(func, [output, batch_y]) +def _check_loss_evaluate(prev_func, func, check_res, output, batch_y, check_level): _missing = '' _unused = '' _duplicated = '' diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index efc2ef7e..61c5bc5c 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -220,13 +220,16 @@ class CheckError(Exception): CheckError. Used in losses.LossBase, metrics.MetricBase. 
""" - def __init__(self, check_res): + def __init__(self, check_res:CheckRes, func_signature:str): err = '' - if check_res['missing']: - err += f"Missing: {check_res['missing']}\n" - if check_res['duplicated']: - err += f"Duplicated: {check_res['duplicated']}\n" - if check_res['unused']: - err += f"Unused: {check_res['unused']}\n" + if check_res.missing: + err += f"Missing: {check_res.missing}\n" + if check_res.duplicated: + err += f"Duplicated: {check_res.duplicated}\n" + if check_res.unused: + err += f"Unused: {check_res.unused}\n" + Exception.__init__(self, err) + self.check_res = check_res + self.func_signature = func_signature From e6864ea7e0f42deff6d50c9e75c639a7a0ddea1f Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 20:27:23 +0800 Subject: [PATCH 13/67] =?UTF-8?q?=E6=9B=B4=E6=96=B0embed=5Floader:=20*=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0fast=5Fload=5Fembedding=E6=96=B9=E6=B3=95?= =?UTF-8?q?=EF=BC=8C=E7=94=A8vocab=E7=9A=84=E8=AF=8D=E7=B4=A2=E5=BC=95pre-?= =?UTF-8?q?trained=E4=B8=AD=E7=9A=84embedding=20*=20=E5=A6=82=E6=9E=9Cvoca?= =?UTF-8?q?b=E6=9C=89=E8=AF=8D=E6=B2=A1=E5=87=BA=E7=8E=B0=E5=9C=A8pre-trai?= =?UTF-8?q?n=E4=B8=AD=EF=BC=8C=E4=BB=8E=E5=B7=B2=E6=9C=89embedding?= =?UTF-8?q?=E4=B8=AD=E6=AD=A3=E6=80=81=E9=87=87=E6=A0=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update embed_loader: * add fast_load_embedding method, to index pre-trained embedding with words in Vocab * If words in Vocab are not exist in pre-trained, sample them from normal distribution computed by current embeddings --- fastNLP/core/trainer.py | 159 +++++++++++++++++++++++++--------------- 1 file changed, 98 insertions(+), 61 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index da8e54f9..54ce2cd9 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,39 +1,38 @@ -import itertools import os import time import warnings -from collections import defaultdict from datetime import datetime from datetime import timedelta import torch -from torch import nn from tensorboardX import SummaryWriter +from torch import nn from fastNLP.core.batch import Batch +from fastNLP.core.dataset import DataSet +from fastNLP.core.losses import _prepare_losser +from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.optimizer import Optimizer from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester +from fastNLP.core.utils import CheckError from fastNLP.core.utils import _build_args from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature -from fastNLP.core.dataset import DataSet -from fastNLP.core.losses import LossBase -from fastNLP.core.metrics import MetricBase -from fastNLP.core.losses import _prepare_losser -from fastNLP.core.metrics import _prepare_metrics -from fastNLP.core.utils import CheckError class Trainer(object): """Main Training Loop """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, validate_every=-1, + + def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, + validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, + metric_key=None, **kwargs): super(Trainer, self).__init__() @@ -50,6 +49,13 @@ def __init__(self, 
train_data, model, losser=None, metrics=None, n_epochs=3, bat # prepare evaluate metrics = _prepare_metrics(metrics) + + # parse metric_key + # increase_better is True. It means the exp result gets better if the indicator increases. + # It is true by default. + self.increase_better = False if metric_key[0] == "-" else True + self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key + # prepare loss losser = _prepare_losser(losser) @@ -67,7 +73,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.save_path = save_path self.print_every = int(print_every) self.validate_every = int(validate_every) - self._best_accuracy = 0 + self.best_metric_indicator = None self._model_device = model.parameters().__next__().device @@ -102,7 +108,7 @@ def train(self): if torch.cuda.is_available() and self.use_cuda: self.model = self.model.cuda() - self.mode(self.model, is_test=False) + self._mode(self.model, is_test=False) start = time.time() self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) @@ -112,7 +118,9 @@ class psudoSW: def __getattr__(self, item): def pass_func(*args, **kwargs): pass + return pass_func + self._summary_writer = psudoSW() else: path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time)) @@ -121,13 +129,14 @@ def pass_func(*args, **kwargs): epoch = 1 while epoch <= self.n_epochs: - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(), as_numpy=False) + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(), + as_numpy=False) self._train_epoch(data_iterator, self.model, epoch, self.dev_data, start) # validate_every override validation at end of epochs if self.dev_data and self.validate_every <= 0: - self.do_validation() + self._do_validation() epoch += 1 finally: self._summary_writer.close() @@ -144,10 +153,10 @@ def _train_epoch(self, data_iterator, model, epoch, dev_data, start, **kwargs): for batch_x, batch_y in data_iterator: # TODO 这里可能会遇到问题,万一用户在model内部修改了prediction的device就会有问题 _move_dict_value_to_device(self._model_device, batch_x, batch_y) - prediction = self.data_forward(model, batch_x) - loss = self.get_loss(prediction, batch_y) - self.grad_backward(loss) - self.update() + prediction = self._data_forward(model, batch_x) + loss = self._compute_loss(prediction, batch_y) + self._grad_backward(loss) + self._update() self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step) for name, param in self.model.named_parameters(): if param.requires_grad: @@ -162,18 +171,18 @@ def _train_epoch(self, data_iterator, model, epoch, dev_data, start, **kwargs): print(print_output) if self.validate_every > 0 and self.step % self.validate_every == 0: - self.do_validation() + self._do_validation() self.step += 1 - def do_validation(self): + def _do_validation(self): res = self.tester.test() for name, num in res.items(): self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) - if self.save_path is not None and self.best_eval_result(res): + if self.save_path is not None and self._better_eval_result(res): self.save_model(self.model, 'best_model_' + self.start_time) - def mode(self, model, is_test=False): + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. 
:param model: a PyTorch model @@ -185,20 +194,20 @@ def mode(self, model, is_test=False): else: model.train() - def update(self): + def _update(self): """Perform weight update on a model. """ self.optimizer.step() - def data_forward(self, network, x): + def _data_forward(self, network, x): x = _build_args(network.forward, **x) y = network(**x) if not isinstance(y, dict): raise TypeError(f"The return value of {get_func_signature(network.forward)} should be dict, got {type(y)}.") return y - def grad_backward(self, loss): + def _grad_backward(self, loss): """Compute gradient with link rules. :param loss: a scalar where back-prop starts @@ -208,7 +217,7 @@ def grad_backward(self, loss): self.model.zero_grad() loss.backward() - def get_loss(self, predict, truth): + def _compute_loss(self, predict, truth): """Compute loss given prediction and ground truth. :param predict: prediction dict, produced by model.forward @@ -224,27 +233,52 @@ def save_model(self, model, model_name, only_param=False): else: torch.save(model, model_name) - def best_eval_result(self, metrics): + def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. - :return: bool, True means current results on dev set is the best. + :return bool value: True means current results on dev set is the best. """ if isinstance(metrics, tuple): loss, metrics = metrics if isinstance(metrics, dict): if len(metrics) == 1: - accuracy = list(metrics.values())[0] + # only single metric, just use it + metric_dict = list(metrics.values())[0] + metrics_name = list(metrics.keys())[0] else: - accuracy = metrics[self.eval_sort_key] - else: - accuracy = metrics - - if accuracy > self._best_accuracy: - self._best_accuracy = accuracy - return True - else: - return False + metrics_name = self.metrics[0].__class__.__name__ + if metrics_name not in metrics: + raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}") + metric_dict = metrics[metrics_name] + + if len(metric_dict) == 1: + indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0] + elif len(metric_dict) > 1 and self.metric_key is None: + raise RuntimeError( + f"Got multiple metric keys: {metric_dict}, but metric_key is not set. 
Which one to use?") + else: + # metric_key is set + if self.metric_key not in metric_dict: + raise RuntimeError(f"matric key {self.metric_key} not found in {metric_dict}") + indicator_val = metric_dict[self.metric_key] + + is_better = True + if self.best_metric_indicator is None: + # first-time validation + self.best_metric_indicator = indicator_val + else: + if self.increase_better is True: + if indicator_val > self.best_metric_indicator: + self.best_metric_indicator = indicator_val + else: + is_better = False + else: + if indicator_val < self.best_metric_indicator: + self.best_metric_indicator = indicator_val + else: + is_better = False + return is_better DEFAULT_CHECK_BATCH_SIZE = 2 @@ -254,6 +288,7 @@ def best_eval_result(self, metrics): WARNING_CHECK_LEVEL = 1 STRICT_CHECK_LEVEL = 2 + def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, check_level=WARNING_CHECK_LEVEL): @@ -264,7 +299,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ for batch_count, (batch_x, batch_y) in enumerate(batch): _move_dict_value_to_device(model_devcie, batch_x, batch_y) # forward check - if batch_count==0: + if batch_count == 0: _check_forward_error(model_func=model.forward, check_level=check_level, batch_x=batch_x) @@ -285,17 +320,17 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ if batch_count == 0: if not isinstance(loss, torch.Tensor): raise TypeError(f"The return value of {get_func_signature(losser.__call__)} should be `torch.Tensor`, " - f"but got `{type(loss)}`.") - if len(loss.size())!=0: + f"but got `{type(loss)}`.") + if len(loss.size()) != 0: raise ValueError(f"The size of return value of {get_func_signature(losser.__call__)} is {loss.size()}, " f"should be torch.size([])") loss.backward() model.zero_grad() - if batch_count+1>=DEFAULT_CHECK_NUM_BATCH: + if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH: break if dev_data is not None: - tester = Tester(data=dataset[:batch_size*DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, + tester = Tester(data=dataset[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, batch_size=batch_size, verbose=-1) tester.test() @@ -305,18 +340,18 @@ def _check_forward_error(model_func, check_level, batch_x): _missing = '' _unused = '' func_signature = get_func_signature(model_func) - if len(check_res['missing'])!=0: + if len(check_res['missing']) != 0: _missing = "Function {} misses {}, only provided with {}, " \ ".\n".format(func_signature, check_res.missing, - list(batch_x.keys())) - if len(check_res['unused'])!=0: + list(batch_x.keys())) + if len(check_res['unused']) != 0: if len(check_res.unused) > 1: _unused = "{} are not used ".format(check_res.unused) else: _unused = "{} is not used ".format(check_res.unused) _unused += "in function {}.\n".format(func_signature) if _missing: - if len(_unused)>0 and STRICT_CHECK_LEVEL: + if len(_unused) > 0 and STRICT_CHECK_LEVEL: _error_str = "(1).{}\n(2).{}".format(_missing, _unused) else: _error_str = _missing @@ -329,38 +364,40 @@ def _check_forward_error(model_func, check_level, batch_x): elif check_level == WARNING_CHECK_LEVEL: warnings.warn(message=_unused) -def _check_loss_evaluate(prev_func, func, check_res, output, batch_y, check_level): + +def _check_loss_evaluate(prev_func, func, check_level, output, batch_y): + check_res = _check_arg_dict_list(func, [output, batch_y]) _missing = '' _unused = '' _duplicated = '' func_signature = get_func_signature(func) prev_func_signature = 
get_func_signature(prev_func) - if len(check_res.missing)>0: + if len(check_res.missing) > 0: _missing = "function {} misses argument {}, \n\t only provided with {}(from {}) and " \ "{}(from target in Dataset)." \ - .format(func_signature, check_res.missing, - list(output.keys()), prev_func_signature, - list(batch_y.keys())) - if len(check_res.unused)>0: + .format(func_signature, check_res.missing, + list(output.keys()), prev_func_signature, + list(batch_y.keys())) + if len(check_res.unused) > 0: if len(check_res.unused) > 1: _unused = "{} are not used ".format(check_res.unused) else: _unused = "{} is not used ".format(check_res.unused) _unused += "in function {}.\n".format(func_signature) - if len(check_res.duplicated)>0: + if len(check_res.duplicated) > 0: if len(check_res.duplicated) > 1: _duplicated = "duplicated keys {} are detected when calling function {}. \n\tDon't set {} as target and output " \ "them in {} at the same time.".format(check_res.duplicated, - func_signature, - check_res.duplicated, - prev_func_signature) - else: - _duplicated = "duplicated key {} is detected when calling function {}. \n\tDon't set {} as target and output " \ - "it in {} at the same time.".format(check_res.duplicated, func_signature, check_res.duplicated, prev_func_signature) - _number_errs = int(len(_missing)!=0) + int(len(_duplicated)!=0) + int(len(_unused)!=0) + else: + _duplicated = "duplicated key {} is detected when calling function {}. \n\tDon't set {} as target and output " \ + "it in {} at the same time.".format(check_res.duplicated, + func_signature, + check_res.duplicated, + prev_func_signature) + _number_errs = int(len(_missing) != 0) + int(len(_duplicated) != 0) + int(len(_unused) != 0) if _number_errs > 0: _error_strs = [] if _number_errs > 1: From e5e7f29d7205a269fd1a922bfd9067f2ead5de81 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 20:27:23 +0800 Subject: [PATCH 14/67] =?UTF-8?q?=E6=9B=B4=E6=96=B0Trainer:=20*=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0Trainer=E5=8F=82=E6=95=B0metric=5Fkey?= =?UTF-8?q?=EF=BC=8C=E6=8C=87=E6=98=8E=E7=94=A8=E6=9D=A5=E5=81=9A=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E9=80=89=E6=8B=A9=E7=9A=84=E6=8C=87=E6=A0=87=E7=9A=84?= =?UTF-8?q?=E5=90=8D=E5=AD=97=20*=20=E5=9C=A8Trainer=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E5=A4=84=E7=90=86tester=E8=BF=94=E5=9B=9E=E7=9A=84=E8=AF=84?= =?UTF-8?q?=E4=BB=B7=E6=8C=87=E6=A0=87=E7=9A=84=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E9=80=89=E6=8B=A9=E5=BD=93=E5=89=8D=E6=9C=80=E5=A5=BD=E7=9A=84?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 168 ++++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 66 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index da8e54f9..d4bedb6f 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,39 +1,38 @@ -import itertools import os import time import warnings -from collections import defaultdict from datetime import datetime from datetime import timedelta import torch -from torch import nn from tensorboardX import SummaryWriter +from torch import nn from fastNLP.core.batch import Batch +from fastNLP.core.dataset import DataSet +from fastNLP.core.losses import _prepare_losser +from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.optimizer import Optimizer from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester +from fastNLP.core.utils import 
CheckError from fastNLP.core.utils import _build_args from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature -from fastNLP.core.dataset import DataSet -from fastNLP.core.losses import LossBase -from fastNLP.core.metrics import MetricBase -from fastNLP.core.losses import _prepare_losser -from fastNLP.core.metrics import _prepare_metrics -from fastNLP.core.utils import CheckError class Trainer(object): """Main Training Loop """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, validate_every=-1, + + def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, + validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, + metric_key=None, **kwargs): super(Trainer, self).__init__() @@ -50,6 +49,13 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # prepare evaluate metrics = _prepare_metrics(metrics) + + # parse metric_key + # increase_better is True. It means the exp result gets better if the indicator increases. + # It is true by default. + self.increase_better = False if metric_key[0] == "-" else True + self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key + # prepare loss losser = _prepare_losser(losser) @@ -67,12 +73,10 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.save_path = save_path self.print_every = int(print_every) self.validate_every = int(validate_every) - self._best_accuracy = 0 + self.best_metric_indicator = None self._model_device = model.parameters().__next__().device - # TODO self._best_accuracy不能表现出当前的metric多种的情况 - if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer else: @@ -102,7 +106,7 @@ def train(self): if torch.cuda.is_available() and self.use_cuda: self.model = self.model.cuda() - self.mode(self.model, is_test=False) + self._mode(self.model, is_test=False) start = time.time() self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) @@ -112,7 +116,9 @@ class psudoSW: def __getattr__(self, item): def pass_func(*args, **kwargs): pass + return pass_func + self._summary_writer = psudoSW() else: path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time)) @@ -121,19 +127,20 @@ def pass_func(*args, **kwargs): epoch = 1 while epoch <= self.n_epochs: - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(), as_numpy=False) + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(), + as_numpy=False) - self._train_epoch(data_iterator, self.model, epoch, self.dev_data, start) + self._train_epoch(data_iterator, self.model, epoch, start) # validate_every override validation at end of epochs if self.dev_data and self.validate_every <= 0: - self.do_validation() + self._do_validation() epoch += 1 finally: self._summary_writer.close() del self._summary_writer - def _train_epoch(self, data_iterator, model, epoch, dev_data, start, **kwargs): + def _train_epoch(self, data_iterator, model, epoch, start): """Training process in one epoch. 
kwargs should contain: @@ -144,10 +151,10 @@ def _train_epoch(self, data_iterator, model, epoch, dev_data, start, **kwargs): for batch_x, batch_y in data_iterator: # TODO 这里可能会遇到问题,万一用户在model内部修改了prediction的device就会有问题 _move_dict_value_to_device(self._model_device, batch_x, batch_y) - prediction = self.data_forward(model, batch_x) - loss = self.get_loss(prediction, batch_y) - self.grad_backward(loss) - self.update() + prediction = self._data_forward(model, batch_x) + loss = self._compute_loss(prediction, batch_y) + self._grad_backward(loss) + self._update() self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step) for name, param in self.model.named_parameters(): if param.requires_grad: @@ -162,18 +169,19 @@ def _train_epoch(self, data_iterator, model, epoch, dev_data, start, **kwargs): print(print_output) if self.validate_every > 0 and self.step % self.validate_every == 0: - self.do_validation() + self._do_validation() self.step += 1 - def do_validation(self): + def _do_validation(self): res = self.tester.test() for name, num in res.items(): self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) - if self.save_path is not None and self.best_eval_result(res): - self.save_model(self.model, 'best_model_' + self.start_time) + if self.save_path is not None and self._better_eval_result(res): + self.save_model(self.model, + "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) - def mode(self, model, is_test=False): + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. :param model: a PyTorch model @@ -185,20 +193,20 @@ def mode(self, model, is_test=False): else: model.train() - def update(self): + def _update(self): """Perform weight update on a model. """ self.optimizer.step() - def data_forward(self, network, x): + def _data_forward(self, network, x): x = _build_args(network.forward, **x) y = network(**x) if not isinstance(y, dict): raise TypeError(f"The return value of {get_func_signature(network.forward)} should be dict, got {type(y)}.") return y - def grad_backward(self, loss): + def _grad_backward(self, loss): """Compute gradient with link rules. :param loss: a scalar where back-prop starts @@ -208,7 +216,7 @@ def grad_backward(self, loss): self.model.zero_grad() loss.backward() - def get_loss(self, predict, truth): + def _compute_loss(self, predict, truth): """Compute loss given prediction and ground truth. :param predict: prediction dict, produced by model.forward @@ -224,27 +232,52 @@ def save_model(self, model, model_name, only_param=False): else: torch.save(model, model_name) - def best_eval_result(self, metrics): + def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. - :return: bool, True means current results on dev set is the best. + :return bool value: True means current results on dev set is the best. 
""" if isinstance(metrics, tuple): loss, metrics = metrics if isinstance(metrics, dict): if len(metrics) == 1: - accuracy = list(metrics.values())[0] + # only single metric, just use it + metric_dict = list(metrics.values())[0] + metrics_name = list(metrics.keys())[0] else: - accuracy = metrics[self.eval_sort_key] - else: - accuracy = metrics - - if accuracy > self._best_accuracy: - self._best_accuracy = accuracy - return True - else: - return False + metrics_name = self.metrics[0].__class__.__name__ + if metrics_name not in metrics: + raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}") + metric_dict = metrics[metrics_name] + + if len(metric_dict) == 1: + indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0] + elif len(metric_dict) > 1 and self.metric_key is None: + raise RuntimeError( + f"Got multiple metric keys: {metric_dict}, but metric_key is not set. Which one to use?") + else: + # metric_key is set + if self.metric_key not in metric_dict: + raise RuntimeError(f"matric key {self.metric_key} not found in {metric_dict}") + indicator_val = metric_dict[self.metric_key] + + is_better = True + if self.best_metric_indicator is None: + # first-time validation + self.best_metric_indicator = indicator_val + else: + if self.increase_better is True: + if indicator_val > self.best_metric_indicator: + self.best_metric_indicator = indicator_val + else: + is_better = False + else: + if indicator_val < self.best_metric_indicator: + self.best_metric_indicator = indicator_val + else: + is_better = False + return is_better DEFAULT_CHECK_BATCH_SIZE = 2 @@ -254,6 +287,7 @@ def best_eval_result(self, metrics): WARNING_CHECK_LEVEL = 1 STRICT_CHECK_LEVEL = 2 + def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, check_level=WARNING_CHECK_LEVEL): @@ -264,7 +298,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ for batch_count, (batch_x, batch_y) in enumerate(batch): _move_dict_value_to_device(model_devcie, batch_x, batch_y) # forward check - if batch_count==0: + if batch_count == 0: _check_forward_error(model_func=model.forward, check_level=check_level, batch_x=batch_x) @@ -285,17 +319,17 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ if batch_count == 0: if not isinstance(loss, torch.Tensor): raise TypeError(f"The return value of {get_func_signature(losser.__call__)} should be `torch.Tensor`, " - f"but got `{type(loss)}`.") - if len(loss.size())!=0: + f"but got `{type(loss)}`.") + if len(loss.size()) != 0: raise ValueError(f"The size of return value of {get_func_signature(losser.__call__)} is {loss.size()}, " f"should be torch.size([])") loss.backward() model.zero_grad() - if batch_count+1>=DEFAULT_CHECK_NUM_BATCH: + if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH: break if dev_data is not None: - tester = Tester(data=dataset[:batch_size*DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, + tester = Tester(data=dataset[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, batch_size=batch_size, verbose=-1) tester.test() @@ -305,18 +339,18 @@ def _check_forward_error(model_func, check_level, batch_x): _missing = '' _unused = '' func_signature = get_func_signature(model_func) - if len(check_res['missing'])!=0: + if len(check_res['missing']) != 0: _missing = "Function {} misses {}, only provided with {}, " \ ".\n".format(func_signature, check_res.missing, - list(batch_x.keys())) - if len(check_res['unused'])!=0: 
+ list(batch_x.keys())) + if len(check_res['unused']) != 0: if len(check_res.unused) > 1: _unused = "{} are not used ".format(check_res.unused) else: _unused = "{} is not used ".format(check_res.unused) _unused += "in function {}.\n".format(func_signature) if _missing: - if len(_unused)>0 and STRICT_CHECK_LEVEL: + if len(_unused) > 0 and STRICT_CHECK_LEVEL: _error_str = "(1).{}\n(2).{}".format(_missing, _unused) else: _error_str = _missing @@ -329,38 +363,40 @@ def _check_forward_error(model_func, check_level, batch_x): elif check_level == WARNING_CHECK_LEVEL: warnings.warn(message=_unused) -def _check_loss_evaluate(prev_func, func, check_res, output, batch_y, check_level): + +def _check_loss_evaluate(prev_func, func, check_level, output, batch_y): + check_res = _check_arg_dict_list(func, [output, batch_y]) _missing = '' _unused = '' _duplicated = '' func_signature = get_func_signature(func) prev_func_signature = get_func_signature(prev_func) - if len(check_res.missing)>0: + if len(check_res.missing) > 0: _missing = "function {} misses argument {}, \n\t only provided with {}(from {}) and " \ "{}(from target in Dataset)." \ - .format(func_signature, check_res.missing, - list(output.keys()), prev_func_signature, - list(batch_y.keys())) - if len(check_res.unused)>0: + .format(func_signature, check_res.missing, + list(output.keys()), prev_func_signature, + list(batch_y.keys())) + if len(check_res.unused) > 0: if len(check_res.unused) > 1: _unused = "{} are not used ".format(check_res.unused) else: _unused = "{} is not used ".format(check_res.unused) _unused += "in function {}.\n".format(func_signature) - if len(check_res.duplicated)>0: + if len(check_res.duplicated) > 0: if len(check_res.duplicated) > 1: _duplicated = "duplicated keys {} are detected when calling function {}. \n\tDon't set {} as target and output " \ "them in {} at the same time.".format(check_res.duplicated, - func_signature, - check_res.duplicated, - prev_func_signature) - else: - _duplicated = "duplicated key {} is detected when calling function {}. \n\tDon't set {} as target and output " \ - "it in {} at the same time.".format(check_res.duplicated, func_signature, check_res.duplicated, prev_func_signature) - _number_errs = int(len(_missing)!=0) + int(len(_duplicated)!=0) + int(len(_unused)!=0) + else: + _duplicated = "duplicated key {} is detected when calling function {}. 
\n\tDon't set {} as target and output " \ + "it in {} at the same time.".format(check_res.duplicated, + func_signature, + check_res.duplicated, + prev_func_signature) + _number_errs = int(len(_missing) != 0) + int(len(_duplicated) != 0) + int(len(_unused) != 0) if _number_errs > 0: _error_strs = [] if _number_errs > 1: From 8a7077fed259b0f7ce216bdf82f2999f2a90f17e Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 1 Dec 2018 22:21:57 +0800 Subject: [PATCH 15/67] =?UTF-8?q?=E6=9B=B4=E6=96=B0Optimizer:=20optimizer.?= =?UTF-8?q?SGD(lr=3Dxxx);=E5=A6=82=E6=9E=9C=E6=B2=A1=E6=9C=89=E4=BC=A0?= =?UTF-8?q?=E5=85=A5parameters=EF=BC=8C=E5=88=99=E5=9C=A8trainer=E4=B8=AD?= =?UTF-8?q?=E5=B8=AE=E4=BB=96=E5=8A=A0=E5=85=A5parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/optimizer.py | 69 ++++++++++--------------------------- fastNLP/core/trainer.py | 8 ++--- test/core/test_optimizer.py | 21 +++++++++++ test/core/test_trainer.py | 1 + 4 files changed, 44 insertions(+), 55 deletions(-) create mode 100644 test/core/test_optimizer.py diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index ff2ee40e..72737b81 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -2,61 +2,28 @@ class Optimizer(object): - """Wrapper of optimizer from framework + def __init__(self, model_params, **kwargs): + if model_params is not None and not isinstance(model_params, torch.Tensor): + raise RuntimeError("model parameters should be torch.Tensor, rather than {}".format(type(model_params))) + self.model_params = model_params + self.settings = kwargs - 1. Adam: lr (float), weight_decay (float) - 2. AdaGrad - 3. RMSProp - 4. SGD: lr (float), momentum (float) - """ +class SGD(Optimizer): + def __init__(self, model_params=None, lr=0.001, momentum=0.9): + super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) - def __init__(self, optimizer_name, **kwargs): - """ - :param optimizer_name: str, the name of the optimizer - :param kwargs: the arguments - - """ - self.optim_name = optimizer_name - self.kwargs = kwargs - - @property - def name(self): - """The name of the optimizer. - - :return: str - """ - return self.optim_name + def construct_from_pytorch(self, model_params): + if self.model_params is None: + self.model_params = model_params + return torch.optim.SGD(self.model_params, **self.settings) - @property - def params(self): - """The arguments used to create the optimizer. 
- :return: dict of (str, *) - """ - return self.kwargs +class Adam(Optimizer): + def __init__(self, model_params=None, lr=0.001, weight_decay=0.8): + super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay) def construct_from_pytorch(self, model_params): - """Construct a optimizer from framework over given model parameters.""" - - if self.optim_name in ["SGD", "sgd"]: - if "lr" in self.kwargs: - if "momentum" not in self.kwargs: - self.kwargs["momentum"] = 0 - optimizer = torch.optim.SGD(model_params, lr=self.kwargs["lr"], momentum=self.kwargs["momentum"]) - else: - raise ValueError("requires learning rate for SGD optimizer") - - elif self.optim_name in ["adam", "Adam"]: - if "lr" in self.kwargs: - if "weight_decay" not in self.kwargs: - self.kwargs["weight_decay"] = 0 - optimizer = torch.optim.Adam(model_params, lr=self.kwargs["lr"], - weight_decay=self.kwargs["weight_decay"]) - else: - raise ValueError("requires learning rate for Adam optimizer") - - else: - raise NotImplementedError - - return optimizer + if self.model_params is None: + self.model_params = model_params + return torch.optim.Adam(self.model_params, **self.settings) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index d4bedb6f..fb9ba25b 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -12,7 +12,7 @@ from fastNLP.core.dataset import DataSet from fastNLP.core.losses import _prepare_losser from fastNLP.core.metrics import _prepare_metrics -from fastNLP.core.optimizer import Optimizer +from fastNLP.core.optimizer import Adam from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester @@ -31,7 +31,7 @@ class Trainer(object): def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", - optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, + optimizer=Adam(lr=0.01, weight_decay=0), need_check_code=True, metric_key=None, **kwargs): super(Trainer, self).__init__() @@ -178,7 +178,7 @@ def _do_validation(self): for name, num in res.items(): self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) if self.save_path is not None and self._better_eval_result(res): - self.save_model(self.model, + self._save_model(self.model, "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) def _mode(self, model, is_test=False): @@ -225,7 +225,7 @@ def _compute_loss(self, predict, truth): """ return self.losser(predict, truth) - def save_model(self, model, model_name, only_param=False): + def _save_model(self, model, model_name, only_param=False): model_name = os.path.join(self.save_path, model_name) if only_param: torch.save(model.state_dict(), model_name) diff --git a/test/core/test_optimizer.py b/test/core/test_optimizer.py new file mode 100644 index 00000000..26e47d43 --- /dev/null +++ b/test/core/test_optimizer.py @@ -0,0 +1,21 @@ +import unittest + +import torch + +from fastNLP.core.optimizer import SGD + + +class TestOptim(unittest.TestCase): + def test_case(self): + optim = SGD(torch.LongTensor(10)) + print(optim.__dict__) + + optim_2 = SGD(lr=0.001) + print(optim_2.__dict__) + + optim_2 = SGD(lr=0.002, momentum=0.989) + print(optim_2.__dict__) + + def test_case_2(self): + with self.assertRaises(RuntimeError): + _ = SGD(0.001) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 
7c0a1a9d..08df6a49 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -4,3 +4,4 @@ class TestTrainer(unittest.TestCase): def test_case_1(self): pass + From 6d36190be4a221234372e58fd9e45bd03d6a0416 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 1 Dec 2018 22:44:24 +0800 Subject: [PATCH 16/67] update LossBase class --- fastNLP/core/losses.py | 100 ++++++++++++++++++++++++++++++----------- test/core/test_loss.py | 74 +++++++++++++++++++++++++++--- 2 files changed, 143 insertions(+), 31 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index aa1ffb89..66664859 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -1,23 +1,29 @@ import torch +import torch.nn.functional as F +from fastNLP.core.utils import CheckError +from fastNLP.core.utils import CheckRes from fastNLP.core.utils import _get_arg_list from fastNLP.core.utils import _map_args from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import _build_args +from fastNLP.core.utils import _check_function_or_method class LossBase(object): def __init__(self): # key: name in target function; value: name in output function self.param_map = {} + self._checked = False def get_loss(self, *args, **kwargs): raise NotImplementedError - def __call__(self, output_dict, target_dict): + def __call__(self, output_dict, target_dict, force_check=False): """ :param output_dict: A dict from forward function of the network. :param target_dict: A dict from DataSet.batch_y. + :param force_check: Boolean. Force to check the mapping functions when it is running. :return: """ args, defaults, defaults_val, varargs, kwargs = _get_arg_list(self.get_loss) @@ -27,50 +33,94 @@ def __call__(self, output_dict, target_dict): ) param_map = self.param_map - for keys in args: - if keys not in param_map: - param_map.update({keys: keys}) - for keys in defaults: - if keys not in param_map: - param_map.update({keys: keys}) + if args is None: + raise RuntimeError( + f"There is not any param in function{get_func_signature(self.get_loss)}" + ) + self._checked = self._checked and not force_check + if not self._checked: + for keys in args: + if keys not in param_map: + param_map.update({keys: keys}) + if defaults is not None: + for keys in defaults: + if keys not in param_map: + param_map.update({keys: keys}) + self.param_map = param_map # param map: key= name in get_loss function, value= name in param dict - reversed_param_map = {val: key for key, val in param_map} + reversed_param_map = {val: key for key, val in param_map.items()} # reversed param map: key= name in param dict, value= name in get_loss function + duplicated = [] + missing = [] + if not self._checked: + for keys, val in output_dict.items(): + if keys in target_dict.keys(): + duplicated.append(keys) + param_val_dict = {} for keys, val in output_dict.items(): - if keys not in target_dict.keys(): - param_val_dict.update({keys: val}) - else: - raise RuntimeError("conflict Error in output dict and target dict with name {}".format(keys)) + param_val_dict.update({keys: val}) for keys, val in target_dict.items(): - if keys not in output_dict.keys(): - param_val_dict.update({keys: val}) - else: - raise RuntimeError("conflict Error in output dict and target dict with name {}".format(keys)) + param_val_dict.update({keys: val}) - for keys in args: - if param_map[keys] not in param_val_dict.keys(): - raise RuntimeError(f"missing param {keys} in function {get_func_signature(self.get_loss)}") + if not self._checked: + for keys in args: + if 
param_map[keys] not in param_val_dict.keys(): + missing.append(keys) + + if len(duplicated) > 0 or len(missing) > 0: + raise CheckError( + CheckRes(missing=missing, unused=[], duplicated=duplicated, required=[], all_needed=[]), + func_signature=get_func_signature(self.get_loss) + ) + + self._checked = True param_map_val = _map_args(reversed_param_map, **param_val_dict) - param_value = _build_args(**param_map_val) + param_value = _build_args(self.get_loss, **param_map_val) loss = self.get_loss(**param_value) if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): if not isinstance(loss, torch.Tensor): - raise RuntimeError("loss ERROR: loss except a torch.Tensor but get {}".format(type(loss))) - raise RuntimeError("loss ERROR: len(loss.size()) except 0 but got {}".format(len(loss.size()))) + raise RuntimeError(f"loss ERROR: loss except a torch.Tensor but get {type(loss)}") + raise RuntimeError(f"loss ERROR: the size of loss except torch.Size([]) but got {loss.size}") return loss class NewLoss(LossBase): def __init__(self, func, key_map=None, **kwargs): - super(NewLoss).__init__() - if not callable(func): - raise RuntimeError("") + super(NewLoss, self).__init__() + _check_function_or_method(func) + if key_map is not None: + if not isinstance(key_map, dict): + raise RuntimeError(f"Loss error: key_map except a {type({})} but got a {type(key_map)}") + self.param_map = key_map + if len(kwargs) > 0: + for key, val in kwargs.items(): + self.param_map.update({key: val}) + + self.get_loss = func + + +class L1Loss(LossBase): + def __init__(self): + super(L1Loss, self).__init__() + self.get_loss = F.l1_loss + + +class BCELoss(LossBase): + def __init__(self): + super(BCELoss, self).__init__() + self.get_loss = F.binary_cross_entropy + + +class NLLLoss(LossBase): + def __init__(self): + super(NLLLoss, self).__init__() + self.get_loss = F.nll_loss class LossInForward(LossBase): diff --git a/test/core/test_loss.py b/test/core/test_loss.py index fdde4f0e..fddc56e9 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -2,6 +2,7 @@ import unittest import torch as tc +import torch.nn.functional as F import fastNLP.core.losses as loss @@ -13,7 +14,11 @@ def test_case_1(self): print (".----------------------------------") - loss_func = loss.Loss("nll") + # loss_func = loss.Loss("nll") + print(callable(tc.nn.NLLLoss)) + loss_func = loss.NewLoss(F.nll_loss) + + nll_loss = loss.NLLLoss() #pdb.set_trace() @@ -35,16 +40,18 @@ def test_case_1(self): y = tc.log(y) - los = loss_func(y , gy) + los = loss_func({'input': y}, {'target': gy}) + losses = nll_loss({'input': y}, {'target': gy}) r = -math.log(.3) - math.log(.3) - math.log(.1) r /= 3 print ("loss = %f" % (los)) print ("r = %f" % (r)) + print ("nll_loss = %f" % (losses)) self.assertEqual(int(los * 1000), int(r * 1000)) - def test_case_2(self): + def _test_case_2(self): #验证squash()的正确性 print ("----------------------------------") @@ -74,7 +81,8 @@ def test_case_2(self): #pdb.set_trace() y = tc.log(y) - los = loss_func(y , gy) + #los = loss_func({'input': y}, {'target': gy}) + los = loss_func(y, gy) print ("loss = %f" % (los)) r = -log(.3) - log(.3) - log(.1) - log(.3) - log(.7) - log(.1) @@ -89,7 +97,8 @@ def test_case_3(self): log = math.log - loss_func = loss.Loss("nll") + #loss_func = loss.Loss("nll") + loss_func = loss.NLLLoss() #pdb.set_trace() @@ -117,7 +126,7 @@ def test_case_3(self): yy = tc.nn.utils.rnn.pack_padded_sequence(y , lens , batch_first = True).data gyy = tc.nn.utils.rnn.pack_padded_sequence(gy , lens , batch_first = 
True).data - los = loss_func(yy , gyy) + los = loss_func({'input': yy}, {'target': gyy}) print ("loss = %f" % (los)) @@ -303,5 +312,58 @@ def test_case_7(self): print ("r = %f" % (r)) self.assertEqual(int(los * 1000), int(r * 1000)) + def test_case_8(self): + def func(a, b): + import torch.nn.functional as F + return F.cross_entropy(a, b) + + def func2(a, truth): + return func(a, truth) + + def func3(predict, truth): + return func(predict, truth) + + def func4(a, b, c=2): + return (a + b) * c + + def func6(a, b, **kwargs): + c = kwargs['c'] + return (a + b) * c + + import torch + from fastNLP.core.losses import LossBase, NewLoss + + get_loss = NewLoss(func, {'a': 'predict', 'b': 'truth'}) + predict = torch.randn(5, 3) + truth = torch.LongTensor([1, 0, 1, 2, 1]) + loss1 = get_loss({'predict': predict}, {'truth': truth}) + get_loss_2 = NewLoss(func2, {'a': 'predict'}) + loss2 = get_loss_2({'predict': predict}, {'truth': truth}) + get_loss_3 = NewLoss(func3) + loss3 = get_loss_3({'predict': predict}, {'truth': truth}) + print(loss1, loss2, loss3) + assert loss1 == loss2 and loss1 == loss3 + + get_loss_4 = NewLoss(func4) + loss4 = get_loss_4({'a': 1, 'b': 3}, {}) + print(loss4) + assert loss4 == (1 + 3) * 2 + + get_loss_5 = NewLoss(func4) + loss5 = get_loss_5({'a': 1, 'b': 3}, {'c': 4}) + print(loss5) + assert loss5 == (1 + 3) * 4 + + get_loss_6 = NewLoss(func6) + loss6 = get_loss_6({'a': 1, 'b': 3}, {'c': 4}) + print(loss6) + assert loss6 == (1 + 3) * 4 + + get_loss_7 = NewLoss(func6, c='cc') + loss7 = get_loss_7({'a': 1, 'b': 3}, {'cc': 4}) + print(loss7) + assert loss7 == (1 + 3) * 4 + + if __name__ == "__main__": unittest.main() From 3a4a7293144e714460ff70f65d10664b5efc9a3d Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 1 Dec 2018 23:43:24 +0800 Subject: [PATCH 17/67] trainer and tester change check_code --- fastNLP/core/metrics.py | 15 +++-- fastNLP/core/tester.py | 6 +- fastNLP/core/trainer.py | 130 +++++++--------------------------------- fastNLP/core/utils.py | 77 +++++++++++++++++++++--- 4 files changed, 103 insertions(+), 125 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 60e0d82f..69bb540d 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -1,6 +1,7 @@ import warnings import inspect +from collections import defaultdict import numpy as np import torch @@ -21,6 +22,7 @@ def evaluate(self, *args, **kwargs): def _init_param_map(self, key_map, **kwargs): self.param_map = {} + value_counter = defaultdict(0) for key, value in key_map.items(): if isinstance(key, str): raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.") @@ -32,16 +34,19 @@ def _init_param_map(self, key_map, **kwargs): raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") self.param_map[key] = value - def __call__(self, output_dict, target_dict, force_check=False): + def __call__(self, output_dict, target_dict, check=False): """ :param output_dict: :param target_dict: + :param check: boolean, :return: """ if not callable(self.evaluate): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") if not self._checked: + # 0. check param_map does not have same value + # 1. 
check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) func_args = func_spect.args @@ -65,7 +70,7 @@ def __call__(self, output_dict, target_dict, force_check=False): mapped_target_dict[func_arg] = target_dict[input_arg] # check duplicated, unused, missing - if force_check or not self._checked: + if check or not self._checked: check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_output_dict]) self._reverse_param_map = {value:key for key, value in check_res.items()} for key, value in check_res.items(): @@ -73,8 +78,9 @@ def __call__(self, output_dict, target_dict, force_check=False): for idx, func_param in enumerate(value): if func_param in self._reverse_param_map: new_value[idx] = self._reverse_param_map[func_param] - if check_res.missing or check_res.duplicated: - raise CheckError(check_res=check_res) + if check_res.missing or check_res.duplicated or check_res.varargs: + raise CheckError(check_res=check_res, + func_signature=get_func_signature(self.evaluate)) refined_args = _build_args(self.evaluate, **mapped_output_dict, **mapped_target_dict) metrics = self.evaluate(**refined_args) @@ -92,7 +98,6 @@ def __init__(self, func, key_map, **kwargs): super().__init__() pass - def _prepare_metrics(metrics): """ diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 33d8cc81..39efb454 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -12,6 +12,7 @@ from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.utils import CheckError +from fastNLP.core.utils import _check_loss_evaluate class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. 
""" @@ -47,7 +48,6 @@ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose= self._model_device = model.parameters().__next__().device - def test(self): # turn on the testing mode; clean up the history network = self._model @@ -75,7 +75,9 @@ def test(self): metric_name = metric.__class__.__name__ eval_results[metric_name] = eval_result except CheckError as e: - pass + prev_func_signature = get_func_signature(self._predict_func) + _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, + check_res=e.check_res, output=output, batch_y=truths) if self.verbose >= 0: diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index da8e54f9..acbcb586 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -20,12 +20,11 @@ from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature from fastNLP.core.dataset import DataSet - -from fastNLP.core.losses import LossBase -from fastNLP.core.metrics import MetricBase from fastNLP.core.losses import _prepare_losser from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.utils import CheckError +from fastNLP.core.utils import _check_loss_evaluate +from fastNLP.core.utils import _check_forward_error class Trainer(object): """Main Training Loop @@ -33,7 +32,7 @@ class Trainer(object): """ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, validate_every=-1, dev_data=None, use_cuda=False, save_path="./save", - optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), need_check_code=True, + optimizer=Optimizer("Adam", lr=0.01, weight_decay=0), check_code_level=0, **kwargs): super(Trainer, self).__init__() @@ -53,8 +52,9 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # prepare loss losser = _prepare_losser(losser) - if need_check_code: - _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data) + if check_code_level>-1: + _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, + check_level=check_code_level) self.train_data = train_data self.dev_data = dev_data # If None, No validation. 
@@ -250,13 +250,9 @@ def best_eval_result(self, metrics): DEFAULT_CHECK_BATCH_SIZE = 2 DEFAULT_CHECK_NUM_BATCH = 2 -IGNORE_CHECK_LEVEL = 0 -WARNING_CHECK_LEVEL = 1 -STRICT_CHECK_LEVEL = 2 - def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, - check_level=WARNING_CHECK_LEVEL): + check_level=0): # check get_loss 方法 model_devcie = model.parameters().__next__().device @@ -265,7 +261,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ _move_dict_value_to_device(model_devcie, batch_x, batch_y) # forward check if batch_count==0: - _check_forward_error(model_func=model.forward, check_level=check_level, + _check_forward_error(forward_func=model.forward, check_level=check_level, batch_x=batch_x) refined_batch_x = _build_args(model.forward, **batch_x) @@ -277,19 +273,21 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ # loss check try: loss = losser(output, batch_y) + # check loss output + if batch_count == 0: + if not isinstance(loss, torch.Tensor): + raise TypeError( + f"The return value of {get_func_signature(losser.get_loss)} should be `torch.Tensor`, " + f"but got `{type(loss)}`.") + if len(loss.size()) != 0: + raise ValueError( + f"The size of return value of {get_func_signature(losser.get_loss)} is {loss.size()}, " + f"should be torch.size([])") + loss.backward() except CheckError as e: _check_loss_evaluate(prev_func=model.forward, func=e.func_signature, check_res=e.check_res, output=output, batch_y=batch_y, check_level=check_level) - # check loss output - if batch_count == 0: - if not isinstance(loss, torch.Tensor): - raise TypeError(f"The return value of {get_func_signature(losser.__call__)} should be `torch.Tensor`, " - f"but got `{type(loss)}`.") - if len(loss.size())!=0: - raise ValueError(f"The size of return value of {get_func_signature(losser.__call__)} is {loss.size()}, " - f"should be torch.size([])") - loss.backward() model.zero_grad() if batch_count+1>=DEFAULT_CHECK_NUM_BATCH: break @@ -300,93 +298,5 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ tester.test() -def _check_forward_error(model_func, check_level, batch_x): - check_res = _check_arg_dict_list(model_func, batch_x) - _missing = '' - _unused = '' - func_signature = get_func_signature(model_func) - if len(check_res['missing'])!=0: - _missing = "Function {} misses {}, only provided with {}, " \ - ".\n".format(func_signature, check_res.missing, - list(batch_x.keys())) - if len(check_res['unused'])!=0: - if len(check_res.unused) > 1: - _unused = "{} are not used ".format(check_res.unused) - else: - _unused = "{} is not used ".format(check_res.unused) - _unused += "in function {}.\n".format(func_signature) - if _missing: - if len(_unused)>0 and STRICT_CHECK_LEVEL: - _error_str = "(1).{}\n(2).{}".format(_missing, _unused) - else: - _error_str = _missing - # TODO 这里可能需要自定义一些Error类型 - raise TypeError(_error_str) - if _unused: - if check_level == STRICT_CHECK_LEVEL: - # TODO 这里可能需要自定义一些Error类型 - raise ValueError(_unused) - elif check_level == WARNING_CHECK_LEVEL: - warnings.warn(message=_unused) - -def _check_loss_evaluate(prev_func, func, check_res, output, batch_y, check_level): - _missing = '' - _unused = '' - _duplicated = '' - func_signature = get_func_signature(func) - prev_func_signature = get_func_signature(prev_func) - if len(check_res.missing)>0: - _missing = "function {} misses argument {}, \n\t only provided with {}(from {}) and " \ - "{}(from target in Dataset)." 
\ - .format(func_signature, check_res.missing, - list(output.keys()), prev_func_signature, - list(batch_y.keys())) - if len(check_res.unused)>0: - if len(check_res.unused) > 1: - _unused = "{} are not used ".format(check_res.unused) - else: - _unused = "{} is not used ".format(check_res.unused) - _unused += "in function {}.\n".format(func_signature) - if len(check_res.duplicated)>0: - if len(check_res.duplicated) > 1: - _duplicated = "duplicated keys {} are detected when calling function {}. \n\tDon't set {} as target and output " \ - "them in {} at the same time.".format(check_res.duplicated, - func_signature, - check_res.duplicated, - prev_func_signature) - else: - _duplicated = "duplicated key {} is detected when calling function {}. \n\tDon't set {} as target and output " \ - "it in {} at the same time.".format(check_res.duplicated, - func_signature, - check_res.duplicated, - prev_func_signature) - _number_errs = int(len(_missing)!=0) + int(len(_duplicated)!=0) + int(len(_unused)!=0) - if _number_errs > 0: - _error_strs = [] - if _number_errs > 1: - count = 0 - order_words = ['Firstly', 'Secondly', 'Thirdly'] - if _missing: - _error_strs.append('{}, {}'.format(order_words[count], _missing)) - count += 1 - if _duplicated: - _error_strs.append('{}, {}'.format(order_words[count], _duplicated)) - count += 1 - if _unused and check_level == STRICT_CHECK_LEVEL: - _error_strs.append('{}, {}'.format(order_words[count], _unused)) - else: - if _unused: - if check_level == STRICT_CHECK_LEVEL: - # TODO 这里可能需要自定义一些Error类型 - _error_strs.append(_unused) - elif check_level == WARNING_CHECK_LEVEL: - _unused = _unused.strip() - warnings.warn(_unused) - else: - if _missing: - _error_strs.append(_missing) - if _duplicated: - _error_strs.append(_duplicated) - if _error_strs: - raise ValueError('\n' + '\n'.join(_error_strs)) + diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 61c5bc5c..d237c190 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -1,11 +1,14 @@ import _pickle import inspect import os +import warnings from collections import Counter from collections import namedtuple import torch -CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed'], verbose=False) + +CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', + 'varargs'], verbose=False) def save_pickle(obj, pickle_path, file_name): """Save an object into a pickle file. @@ -105,7 +108,6 @@ def _check_arg_dict_list(func, args): assert callable(func) and isinstance(arg_dict_list, (list, tuple)) assert len(arg_dict_list) > 0 and isinstance(arg_dict_list[0], dict) spect = inspect.getfullargspec(func) - assert spect.varargs is None, 'Positional Arguments({}) are not supported.'.format(spect.varargs) all_args = set([arg for arg in spect.args if arg!='self']) defaults = [] if spect.defaults is not None: @@ -125,7 +127,8 @@ def _check_arg_dict_list(func, args): unused=unused, duplicated=duplicated, required=list(require_args), - all_needed=list(all_args)) + all_needed=list(all_args), + varargs=[arg for arg in spect.varargs]) def get_func_signature(func): """ @@ -221,15 +224,73 @@ class CheckError(Exception): CheckError. Used in losses.LossBase, metrics.MetricBase. 
""" def __init__(self, check_res:CheckRes, func_signature:str): - err = '' + errs = [f'The following problems occurred when calling {func_signature}'] + + if check_res.varargs: + errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") if check_res.missing: - err += f"Missing: {check_res.missing}\n" + errs.append(f"\tmissing param: {check_res.missing}") if check_res.duplicated: - err += f"Duplicated: {check_res.duplicated}\n" + errs.append(f"\tduplicated param: {check_res.duplicated}") if check_res.unused: - err += f"Unused: {check_res.unused}\n" + errs.append(f"\tunused param: {check_res.unused}") - Exception.__init__(self, err) + Exception.__init__(self, '\n'.join(errs)) self.check_res = check_res self.func_signature = func_signature + +IGNORE_CHECK_LEVEL = 0 +WARNING_CHECK_LEVEL = 1 +STRICT_CHECK_LEVEL = 2 + +def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res:CheckRes, + output:dict, batch_y:dict, check_level=0): + errs = [] + _unused = [] + if check_res.varargs: + errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, " + f"please delete it.)") + if check_res.missing: + errs.append(f"\tmissing param: {check_res.missing}, only provided with {list(output.keys())}" + f"(from {prev_func_signature}) and {list(batch_y.keys())}(from targets in Dataset).") + if check_res.duplicated: + errs.append(f"\tduplicated param: {check_res.duplicated}, delete {check_res.duplicated} in the output of " + f"{check_res.duplicated} or do not set {check_res.duplicated} as targets. ") + if check_res.unused: + _unused = [f"\tunused param: {check_res.unused}"] + if check_level == STRICT_CHECK_LEVEL: + errs.extend(_unused) + + if len(errs)>0: + errs.insert(0, f'The following problems occurred when calling {func_signature}') + raise NameError('\n'.join(errs)) + if _unused: + if check_level == WARNING_CHECK_LEVEL: + _unused_warn = _unused[0] + f' in {func_signature}.' + warnings.warn(message=_unused_warn) + + +def _check_forward_error(forward_func, batch_x, check_level): + check_res = _check_arg_dict_list(forward_func, batch_x) + func_signature = get_func_signature(forward_func) + + errs = [] + _unused = [] + + if check_res.varargs: + errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") + if check_res.missing: + errs.append(f"\tmissing param: {check_res.missing}, only provided with {list(batch_x.keys())}.") + if check_res.unused: + _unused = [f"\tunused param: {check_res.unused}"] + if check_level == STRICT_CHECK_LEVEL: + errs.extend(_unused) + + if len(errs)>0: + errs.insert(0, f'The following problems occurred when calling {func_signature}') + raise NameError('\n'.join(errs)) + if _unused: + if check_level == WARNING_CHECK_LEVEL: + _unused_warn = _unused[0] + f' in {func_signature}.' 
+ warnings.warn(message=_unused_warn) \ No newline at end of file From 3daa889bb01d0c6edab0ddb1ad7a2a5dbd449cda Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 1 Dec 2018 23:44:07 +0800 Subject: [PATCH 18/67] LossInForward update --- fastNLP/core/losses.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index aa1ffb89..9306f9f9 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -76,14 +76,15 @@ def __init__(self, func, key_map=None, **kwargs): class LossInForward(LossBase): def __init__(self, loss_key='loss'): super().__init__() - self.loss_key = loss_key - def get_loss(self, *args, **kwargs): - pass + def get_loss(self, **kwargs): + if self.loss_key not in kwargs: + pass def __call__(self, output_dict, predict_dict): - pass + + return self.get_loss(**output_dict) def _prepare_losser(losser): From f24fca1b21e23b5692ae8cd89ceac844d4ea94a8 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 09:21:08 +0800 Subject: [PATCH 19/67] change the calculation of metric to batch by batch. The older design is to concat all data before calculation. --- fastNLP/core/batch.py | 10 +++- fastNLP/core/fieldarray.py | 2 +- fastNLP/core/losses.py | 21 +++++++-- fastNLP/core/metrics.py | 95 +++++++++++++++++++++++++++++++------- fastNLP/core/tester.py | 6 +-- fastNLP/core/trainer.py | 33 +++++++------ fastNLP/core/utils.py | 32 ++++--------- 7 files changed, 133 insertions(+), 66 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 38da83da..0aca6055 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -1,4 +1,5 @@ import torch +import numpy as np class Batch(object): @@ -45,7 +46,7 @@ def __next__(self): if field.is_target or field.is_input: batch = field.get(indices) if not self.as_numpy: - batch = torch.from_numpy(batch) + batch = to_tensor(batch, field.dtype) if field.is_target: batch_y[field_name] = batch if field.is_input: @@ -54,3 +55,10 @@ def __next__(self): self.curidx = endidx return batch_x, batch_y + +def to_tensor(batch, dtype): + if dtype in (np.int8, np.int16, np.int32, np.int64): + batch = torch.LongTensor(batch) + if dtype in (np.float32, np.float64): + batch = torch.FloatTensor(batch) + return batch \ No newline at end of file diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index f93fbf2e..714fa169 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -39,7 +39,7 @@ def _type_detection(content): @staticmethod def _map_to_np_type(basic_type): - type_mapping = {int: np.int64, float: np.double, str: np.str} + type_mapping = {int: np.int64, float: np.float64, str: np.str} return type_mapping[basic_type] def __repr__(self): diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 85b16e64..564eb7ce 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -126,15 +126,30 @@ def __init__(self): class LossInForward(LossBase): def __init__(self, loss_key='loss'): super().__init__() + if not isinstance(loss_key, str): + raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.") self.loss_key = loss_key def get_loss(self, **kwargs): if self.loss_key not in kwargs: - pass + check_res = CheckRes(missing=[self.loss_key], + unused=[], + duplicated=[], + required=[], + all_needed=[], + varargs=[]) + raise CheckError(check_res=check_res, func_signature=get_func_signature(self.get_loss)) - def __call__(self, output_dict, predict_dict): + def __call__(self, output_dict, predict_dict, force_check=False): - 
return self.get_loss(**output_dict) + loss = self.get_loss(**output_dict) + + if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): + if not isinstance(loss, torch.Tensor): + raise TypeError(f"loss ERROR: loss except a torch.Tensor but got {type(loss)}") + raise RuntimeError(f"loss ERROR: the size of loss except torch.Size([]) but got {loss.size}") + + return loss def _prepare_losser(losser): diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 69bb540d..f8fc1d49 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -10,7 +10,7 @@ from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _build_args from fastNLP.core.utils import CheckError - +from fastNLP.core.utils import _check_function_or_method class MetricBase(object): def __init__(self): @@ -20,19 +20,32 @@ def __init__(self): def evaluate(self, *args, **kwargs): raise NotImplementedError - def _init_param_map(self, key_map, **kwargs): - self.param_map = {} - value_counter = defaultdict(0) - for key, value in key_map.items(): - if isinstance(key, str): - raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.") - if isinstance(value, str): - raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.") - self.param_map[key] = value + def _init_param_map(self, key_map=None, **kwargs): + value_counter = defaultdict(set) + if key_map is not None: + if not isinstance(key_map, dict): + raise TypeError("key_map must be `dict`, got {}.".format(type(key_map))) + for key, value in key_map.items(): + if value is None: + self.param_map[key] = key + continue + if isinstance(key, str): + raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.") + if isinstance(value, str): + raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.") + self.param_map[key] = value + value_counter[value].add(key) for key, value in kwargs.items(): + if value is None: + self.param_map[key] = key + continue if isinstance(value, str): raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") self.param_map[key] = value + value_counter[value].add(key) + for value, key_set in value_counter.items(): + if len(key_set)>1: + raise ValueError(f"Several params:{key_set} are provided with one output {value}.") def __call__(self, output_dict, target_dict, check=False): """ @@ -45,8 +58,6 @@ def __call__(self, output_dict, target_dict, check=False): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") if not self._checked: - # 0. check param_map does not have same value - # 1. check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) func_args = func_spect.args @@ -58,26 +69,32 @@ def __call__(self, output_dict, target_dict, check=False): if arg not in self.param_map: self.param_map[arg] = arg #This param does not need mapping. self._evaluate_args = func_args + self._reverse_param_map = {value: key for key, value in self.param_map.items()} # need to wrap inputs in dict. 
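# Illustrative note (the key names here are assumed, not from this patch):
# param_map maps each evaluate() argument to the dict key it is read from,
# e.g. a mapping {'pred': 'output'} fills evaluate(pred=...) from output_dict
# or target_dict under the key 'output'; arguments without an explicit mapping
# keep their own names as keys.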
mapped_output_dict = {} mapped_target_dict = {} for func_arg in self._evaluate_args: input_arg = self.param_map[func_arg] + if input_arg in self._reverse_param_map: + mapped_arg = func_arg + else: + mapped_arg = input_arg if input_arg in output_dict: - mapped_output_dict[func_arg] = output_dict[input_arg] + mapped_output_dict[mapped_arg] = output_dict[input_arg] if input_arg in target_dict: - mapped_target_dict[func_arg] = target_dict[input_arg] + mapped_target_dict[mapped_arg] = target_dict[input_arg] # check duplicated, unused, missing if check or not self._checked: check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_output_dict]) - self._reverse_param_map = {value:key for key, value in check_res.items()} for key, value in check_res.items(): new_value = list(value) for idx, func_param in enumerate(value): if func_param in self._reverse_param_map: - new_value[idx] = self._reverse_param_map[func_param] + new_value[idx] = self._reverse_param_map[func_param] + f'(assign to {func_param})' + else: + new_value[idx] = func_param if check_res.missing or check_res.duplicated or check_res.varargs: raise CheckError(check_res=check_res, func_signature=get_func_signature(self.evaluate)) @@ -93,11 +110,55 @@ def __call__(self, output_dict, target_dict, check=False): return metrics -class Metric(MetricBase): +class FuncMetric(MetricBase): def __init__(self, func, key_map, **kwargs): super().__init__() + + _check_function_or_method(func=func) + self._init_param_map(key_map=key_map, **kwargs) + + self.evaluate = func + + +class AccuracyMetric(MetricBase): + def __init__(self, predictions=None, targets=None, masks=None, seq_lens=None): + super().__init__() + + self._init_param_map(predictions=predictions, targets=targets, + masks=masks, seq_lens=seq_lens) + + def evaluate(self, predictions, targets, masks=None, seq_lens=None): + """ + + :param predictions: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: + torch.Size([]), torch.Size([n_classes,]), torch.Size([max_len,]), torch.Size([max_len, n_classes]) + :param targets: List of (torch.Tensor, or numpy.ndarray). Element's can be: + torch.Size([]), torch.Size([]), torch.Size([max_len,]), torch.Size([max_len, ]) + :param masks: List of (torch.Tensor, or numpy.ndarray). Element's can be: + None, None, torch.Size([max_len,], torch.Size([max_len, ]) + :param seq_lens: List of (torch.Tensor, or numpy.ndarray). 
Element's can be: + None, None, torch.Size([1], torch.Size([1]) + :return: dict({'acc': float}) + """ pass + def _check_evaluate_param(self, predictions, targets, masks=None, seq_lens=None): + # check the validity of self.evaluate param + prediction = predictions[0] + target = targets[0] + + if len(np.shape(prediction))==len(target): + pass + + if masks is not None: + mask = masks[0] + if seq_lens is not None: + seq_len = seq_lens[0] + + + + + def _prepare_metrics(metrics): """ diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 39efb454..e809cd06 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -7,11 +7,11 @@ from fastNLP.core.batch import Batch from fastNLP.core.sampler import SequentialSampler from fastNLP.core.dataset import DataSet +from fastNLP.core.utils import CheckError from fastNLP.core.utils import _build_args from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.metrics import _prepare_metrics -from fastNLP.core.utils import CheckError from fastNLP.core.utils import _check_loss_evaluate class Tester(object): @@ -57,7 +57,7 @@ def test(self): with torch.no_grad(): for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(self._model_device, batch_x, batch_y) + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) prediction = self._data_forward(self._predict_func, batch_x) assert isinstance(prediction, dict) for k, v in prediction.items(): @@ -77,7 +77,7 @@ def test(self): except CheckError as e: prev_func_signature = get_func_signature(self._predict_func) _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, - check_res=e.check_res, output=output, batch_y=truths) + check_res=e.check_res, output=output, batch_y=truths, check_level=0) if self.verbose >= 0: diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 39d76521..6d31e390 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,6 +1,5 @@ import os import time -import warnings from datetime import datetime from datetime import timedelta @@ -9,24 +8,19 @@ from torch import nn from fastNLP.core.batch import Batch -from fastNLP.core.dataset import DataSet -from fastNLP.core.losses import _prepare_losser -from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.optimizer import Adam from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester -from fastNLP.core.utils import CheckError -from fastNLP.core.utils import _build_args -from fastNLP.core.utils import _check_arg_dict_list -from fastNLP.core.utils import _move_dict_value_to_device -from fastNLP.core.utils import get_func_signature from fastNLP.core.dataset import DataSet from fastNLP.core.losses import _prepare_losser from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.utils import CheckError from fastNLP.core.utils import _check_loss_evaluate from fastNLP.core.utils import _check_forward_error +from fastNLP.core.utils import _build_args +from fastNLP.core.utils import _move_dict_value_to_device +from fastNLP.core.utils import get_func_signature class Trainer(object): """Main Training Loop @@ -52,6 +46,9 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat if metrics and (dev_data is None): raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. 
") + # check save_path + if not (save_path is None or isinstance(save_path, str)): + raise ValueError("save_path can only be None or `str`.") # prepare evaluate metrics = _prepare_metrics(metrics) @@ -156,7 +153,7 @@ def _train_epoch(self, data_iterator, model, epoch, start): """ for batch_x, batch_y in data_iterator: # TODO 这里可能会遇到问题,万一用户在model内部修改了prediction的device就会有问题 - _move_dict_value_to_device(self._model_device, batch_x, batch_y) + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) prediction = self._data_forward(model, batch_x) loss = self._compute_loss(prediction, batch_y) self._grad_backward(loss) @@ -232,11 +229,12 @@ def _compute_loss(self, predict, truth): return self.losser(predict, truth) def _save_model(self, model, model_name, only_param=False): - model_name = os.path.join(self.save_path, model_name) - if only_param: - torch.save(model.state_dict(), model_name) - else: - torch.save(model, model_name) + if self.save_path is not None: + model_name = os.path.join(self.save_path, model_name) + if only_param: + torch.save(model.state_dict(), model_name) + else: + torch.save(model, model_name) def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. @@ -297,7 +295,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_count, (batch_x, batch_y) in enumerate(batch): - _move_dict_value_to_device(model_devcie, batch_x, batch_y) + _move_dict_value_to_device(batch_x, batch_y, device=model_devcie) # forward check if batch_count==0: _check_forward_error(forward_func=model.forward, check_level=check_level, @@ -335,6 +333,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ if dev_data is not None: tester = Tester(data=dataset[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, batch_size=batch_size, verbose=-1) - tester.test() + evaluate_results = tester.test() + # TODO 这里需要检查是否返回来的值是否是合理的 diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index d237c190..cfc77f46 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -122,13 +122,13 @@ def _check_arg_dict_list(func, args): input_args = set(input_arg_count.keys()) missing = list(require_args - input_args) unused = list(input_args - all_args) - + varargs = [] if spect.varargs else [arg for arg in spect.varargs] return CheckRes(missing=missing, unused=unused, duplicated=duplicated, required=list(require_args), all_needed=list(all_args), - varargs=[arg for arg in spect.varargs]) + varargs=varargs) def get_func_signature(func): """ @@ -165,6 +165,7 @@ def forward(self, a, b='a', **args) signature_str = func.__name__ + signature_str return signature_str + def _is_function_or_method(func): """ @@ -179,26 +180,8 @@ def _check_function_or_method(func): if not _is_function_or_method(func): raise TypeError(f"{type(func)} is not a method or function.") -def _syn_model_data(model, *args): - """ - - move data to model's device, element in *args should be dict. This is a inplace change. 
- :param model: - :param args: - :return: - """ - if len(model.state_dict())==0: - raise ValueError("model has no parameter.") - device = model.parameters().__next__().device - for arg in args: - if isinstance(arg, dict): - for key, value in arg.items(): - if isinstance(value, torch.Tensor): - arg[key] = value.to(device) - else: - raise TypeError("Only support `dict` type right now.") -def _move_dict_value_to_device(device, *args): +def _move_dict_value_to_device(*args, device:torch.device): """ move data to model's device, element in *args should be dict. This is a inplace change. @@ -240,6 +223,7 @@ def __init__(self, check_res:CheckRes, func_signature:str): self.check_res = check_res self.func_signature = func_signature + IGNORE_CHECK_LEVEL = 0 WARNING_CHECK_LEVEL = 1 STRICT_CHECK_LEVEL = 2 @@ -252,8 +236,8 @@ def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, " f"please delete it.)") if check_res.missing: - errs.append(f"\tmissing param: {check_res.missing}, only provided with {list(output.keys())}" - f"(from {prev_func_signature}) and {list(batch_y.keys())}(from targets in Dataset).") + errs.append(f"\tmissing param: `{check_res.missing}`, provided with `{list(output.keys())}`" + f"(from output of `{prev_func_signature}`) and `{list(batch_y.keys())}`(from targets in Dataset).") if check_res.duplicated: errs.append(f"\tduplicated param: {check_res.duplicated}, delete {check_res.duplicated} in the output of " f"{check_res.duplicated} or do not set {check_res.duplicated} as targets. ") @@ -281,7 +265,7 @@ def _check_forward_error(forward_func, batch_x, check_level): if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") if check_res.missing: - errs.append(f"\tmissing param: {check_res.missing}, only provided with {list(batch_x.keys())}.") + errs.append(f"\tmissing param: {check_res.missing}, provided with {list(batch_x.keys())}.") if check_res.unused: _unused = [f"\tunused param: {check_res.unused}"] if check_level == STRICT_CHECK_LEVEL: From bd94dd2c7f6ab3465c07a7bc2884d847e3315911 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 10:30:25 +0800 Subject: [PATCH 20/67] =?UTF-8?q?metrics=E4=B8=AD=E5=AE=9E=E7=8E=B0Accurac?= =?UTF-8?q?yMetric,=20=E5=B9=B6=E5=B0=86metric=E7=9A=84=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E6=96=B9=E5=BC=8F=E7=94=B1=E4=B8=80=E6=8A=8A=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=BAbatch=20by=20batch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 90 +++++++++++++++++++++++++---------------- fastNLP/core/tester.py | 37 ++++++++--------- fastNLP/core/utils.py | 35 +++++++++++++++- 3 files changed, 105 insertions(+), 57 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f8fc1d49..e599ec7b 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -10,7 +10,7 @@ from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import _build_args from fastNLP.core.utils import CheckError -from fastNLP.core.utils import _check_function_or_method +from fastNLP.core.utils import seq_lens_to_masks class MetricBase(object): def __init__(self): @@ -21,6 +21,13 @@ def evaluate(self, *args, **kwargs): raise NotImplementedError def _init_param_map(self, key_map=None, **kwargs): + """ + + check the validity of key_map and other param map. 
Add these into self.param_map + :param key_map: dict + :param kwargs: + :return: None + """ value_counter = defaultdict(set) if key_map is not None: if not isinstance(key_map, dict): @@ -47,6 +54,9 @@ def _init_param_map(self, key_map=None, **kwargs): if len(key_set)>1: raise ValueError(f"Several params:{key_set} are provided with one output {value}.") + def get_metric(self, reset=True): + raise NotImplemented + def __call__(self, output_dict, target_dict, check=False): """ :param output_dict: @@ -100,25 +110,9 @@ def __call__(self, output_dict, target_dict, check=False): func_signature=get_func_signature(self.evaluate)) refined_args = _build_args(self.evaluate, **mapped_output_dict, **mapped_target_dict) - metrics = self.evaluate(**refined_args) - - if not isinstance(metrics, dict): - raise TypeError(f"The return value of {get_func_signature(self.evaluate)} must be `dict`, " - f"got {type(metrics)}.") + self.evaluate(**refined_args) self._checked = True - return metrics - - -class FuncMetric(MetricBase): - def __init__(self, func, key_map, **kwargs): - super().__init__() - - _check_function_or_method(func=func) - self._init_param_map(key_map=key_map, **kwargs) - - self.evaluate = func - class AccuracyMetric(MetricBase): def __init__(self, predictions=None, targets=None, masks=None, seq_lens=None): @@ -127,35 +121,61 @@ def __init__(self, predictions=None, targets=None, masks=None, seq_lens=None): self._init_param_map(predictions=predictions, targets=targets, masks=masks, seq_lens=seq_lens) + self.total = 0 + self.acc_count = 0 + def evaluate(self, predictions, targets, masks=None, seq_lens=None): """ :param predictions: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: - torch.Size([]), torch.Size([n_classes,]), torch.Size([max_len,]), torch.Size([max_len, n_classes]) + torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes]) :param targets: List of (torch.Tensor, or numpy.ndarray). Element's can be: - torch.Size([]), torch.Size([]), torch.Size([max_len,]), torch.Size([max_len, ]) + torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len]) :param masks: List of (torch.Tensor, or numpy.ndarray). Element's can be: - None, None, torch.Size([max_len,], torch.Size([max_len, ]) + None, None, torch.Size([B, max_len], torch.Size([B, max_len]) :param seq_lens: List of (torch.Tensor, or numpy.ndarray). Element's can be: - None, None, torch.Size([1], torch.Size([1]) + None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided. 
:return: dict({'acc': float}) """ - pass - - def _check_evaluate_param(self, predictions, targets, masks=None, seq_lens=None): - # check the validity of self.evaluate param - prediction = predictions[0] - target = targets[0] - - if len(np.shape(prediction))==len(target): + if not isinstance(predictions, torch.Tensor): + raise NameError(f"`predictions` in {get_func_signature(self.evaluate())} expects torch.Tensor," + f"got {type(predictions)}.") + if not isinstance(targets, torch.Tensor): + raise NameError(f"`targets` in {get_func_signature(self.evaluate())} expects torch.Tensor," + f"got {type(targets)}.") + + if masks is not None and not isinstance(masks, torch.Tensor): + raise NameError(f"`masks` in {get_func_signature(self.evaluate())} expects torch.Tensor," + f"got {type(masks)}.") + elif seq_lens is not None and not isinstance(seq_lens, torch.Tensor): + raise NameError(f"`seq_lens` in {get_func_signature(self.evaluate())} expects torch.Tensor," + f"got {type(seq_lens)}.") + + if masks is None and seq_lens is not None: + masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) + + if predictions.size()==targets.size(): pass + elif len(predictions.size())==len(targets.size())+1: + predictions = predictions.argmax(dim=-1) + else: + raise RuntimeError(f"In {get_func_signature(self.evaluate())}, when predictions with " + f"size:{predictions.size()}, targets should with size: {predictions.size()} or " + f"{predictions.size()[:-1]}, got {targets.size()}.") if masks is not None: - mask = masks[0] - if seq_lens is not None: - seq_len = seq_lens[0] - - + self.acc_count += torch.sum(torch.eq(predictions, targets).float() * masks.float()).item() + self.total += torch.sum(masks.float()).item() + else: + self.acc_count += torch.sum(torch.eq(predictions, targets).float()).item() + self.total += np.prod(list(torch.size(predictions))) + + def get_metric(self, reset=True): + evaluate_result = {'acc': self.acc_count/self.total} + if reset: + self.acc_count = 0 + self.total = 0 + return evaluate_result diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index e809cd06..f62d9337 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -54,32 +54,29 @@ def test(self): self._mode(network, is_test=True) output, truths = defaultdict(list), defaultdict(list) data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False) - - with torch.no_grad(): - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - prediction = self._data_forward(self._predict_func, batch_x) - assert isinstance(prediction, dict) - for k, v in prediction.items(): - output[k].append(v) - for k, v in batch_y.items(): - truths[k].append(v) - for k, v in output.items(): - output[k] = itertools.chain(*v) - for k, v in truths.items(): - truths[k] = itertools.chain(*v) - eval_results = {} + eval_results = {} try: - for metric in self.metrics: - eval_result = metric(output, truths) - metric_name = metric.__class__.__name__ - eval_results[metric_name] = eval_result + with torch.no_grad(): + for batch_x, batch_y in data_iterator: + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) + prediction = self._data_forward(self._predict_func, batch_x) + if not isinstance(prediction, dict): + raise TypeError(f"The return value of {get_func_signature(self._predict_func)} " + f"must be `dict`, got {type(prediction)}.") + for metric in self.metrics: + metric(prediction, batch_y) + for metric in self.metrics: + eval_result = 
metric.get_metric() + if not isinstance(eval_result, dict): + raise TypeError(f"The return value of {get_func_signature(metric.get_metric)} must be " + f"`dict`, got {type(eval_result)}") + metric_name = metric.__class__.__name__ + eval_results[metric_name] = eval_result except CheckError as e: prev_func_signature = get_func_signature(self._predict_func) _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, check_res=e.check_res, output=output, batch_y=truths, check_level=0) - if self.verbose >= 0: print("[tester] \n{}".format(self._format_eval_results(eval_results))) self._mode(network, is_test=False) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index cfc77f46..08640d0f 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -4,7 +4,9 @@ import warnings from collections import Counter from collections import namedtuple + import torch +import numpy as np CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', @@ -98,7 +100,6 @@ def _get_arg_list(func): return args, defaults, defaults_val, varargs, kwargs - # check args def _check_arg_dict_list(func, args): if isinstance(args, dict): @@ -277,4 +278,34 @@ def _check_forward_error(forward_func, batch_x, check_level): if _unused: if check_level == WARNING_CHECK_LEVEL: _unused_warn = _unused[0] + f' in {func_signature}.' - warnings.warn(message=_unused_warn) \ No newline at end of file + warnings.warn(message=_unused_warn) + + +def seq_lens_to_masks(seq_lens, float=True): + """ + + Convert seq_lens to masks. + :param seq_lens: list, np.ndarray, or torch.LongTensor, shape should all be (B,) + :param float: if True, the return masks is in float type, otherwise it is byte. + :return: list, np.ndarray or torch.Tensor, shape will be (B, max_length) + """ + if isinstance(seq_lens, np.ndarray): + assert len(np.shape(seq_lens))==1, f"seq_lens can only have one dimension, got {len(np.shape(seq_lens))}." + assert seq_lens.dtype in (int, np.int32, np.int64), f"seq_lens can only be integer, not {seq_lens.dtype}." + raise NotImplemented + elif isinstance(seq_lens, torch.LongTensor): + assert len(seq_lens.size())==1, f"seq_lens can only have one dimension, got {len(seq_lens.size())==1}." 
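# Worked example (values assumed for illustration): with seq_lens = [2, 3] and
# max_len = 3, the lines below broadcast the index row [0, 1, 2] against each
# length and produce masks [[1, 1, 0], [1, 1, 1]], where 1 marks a real
# position and 0 marks padding.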
+ batch_size = seq_lens.size(0) + max_len = seq_lens.max() + indexes = torch.arange(max_len).view(1, -1).repeat(batch_size, 1).to(seq_lens.device) + masks = indexes.lt(seq_lens.unsqueeze(1)) + + if float: + masks = masks.float() + + return masks + elif isinstance(seq_lens, list): + raise NotImplemented + else: + raise NotImplemented + From 84024aaaa4a2a6be91fec1162250d5a03fe30bc7 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 10:36:20 +0800 Subject: [PATCH 21/67] =?UTF-8?q?=5Fprepare=5Fmetric=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=A3=80=E6=9F=A5evaluate=E4=B8=8Eget=5Fmetr?= =?UTF-8?q?ic=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index e599ec7b..5296b0bf 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -193,6 +193,11 @@ def _prepare_metrics(metrics): if isinstance(metric, type): metric = metric() if isinstance(metric, MetricBase): + metric_name = metric.__class__.__name__ + if not callable(metric.evaluate): + raise TypeError(f"{metric_name}.evaluate must be callable, got {type(metric.evaluate)}.") + if not callable(metric.get_metric): + raise TypeError(f"{metric_name}.get_metric must be callable, got {type(metric.get_metric)}.") _metrics.append(metric) else: raise TypeError(f"The type of metric in metrics must be `fastNLP.MetricBase`, not `{type(metric)}`.") From fb5215ae733ec50bcb6b71626db9ea7d8486a56a Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 2 Dec 2018 10:58:10 +0800 Subject: [PATCH 22/67] =?UTF-8?q?fix=20bug=20in=20Trainer=20about=20metric?= =?UTF-8?q?=5Fkey=20=E6=9B=B4=E6=96=B0Optimizer:=20=E5=A4=9A=E7=A7=8D?= =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E6=96=B9=E6=B3=95=201.=20SGD()=202.?= =?UTF-8?q?=20SGD(0.01)=203.=20SGD(lr=3D0.01)=204.=20SGD(lr=3D0.01,=20mome?= =?UTF-8?q?ntum=3D0.9)=205.=20SGD(model.parameters(),=20lr=3D0.1,=20moment?= =?UTF-8?q?um=3D0.9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/optimizer.py | 58 ++++++++++++++++++++++++++++++++++--- fastNLP/core/trainer.py | 20 ++++++++----- test/core/test_optimizer.py | 43 ++++++++++++++++++++------- 3 files changed, 99 insertions(+), 22 deletions(-) diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 72737b81..4cb21462 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -3,14 +3,41 @@ class Optimizer(object): def __init__(self, model_params, **kwargs): - if model_params is not None and not isinstance(model_params, torch.Tensor): - raise RuntimeError("model parameters should be torch.Tensor, rather than {}".format(type(model_params))) + if model_params is not None and not hasattr(model_params, "__next__"): + raise RuntimeError("model parameters should be a generator, rather than {}".format(type(model_params))) self.model_params = model_params self.settings = kwargs class SGD(Optimizer): - def __init__(self, model_params=None, lr=0.001, momentum=0.9): + def __init__(self, *args, **kwargs): + model_params, lr, momentum = None, 0.01, 0.9 + if len(args) == 0 and len(kwargs) == 0: + # SGD() + pass + elif len(args) == 1 and len(kwargs) == 0: + if isinstance(args[0], float) or isinstance(args[0], int): + # SGD(0.001) + lr = args[0] + elif hasattr(args[0], "__next__"): + # SGD(model.parameters()) args[0] is a generator + model_params = args[0] + else: + raise RuntimeError("Not 
supported type {}.".format(type(args[0]))) + elif 2 >= len(kwargs) > 0 and len(args) <= 1: + # SGD(lr=0.01), SGD(lr=0.01, momentum=0.9), SGD(model.parameters(), lr=0.1, momentum=0.9) + if len(args) == 1: + if hasattr(args[0], "__next__"): + model_params = args[0] + else: + raise RuntimeError("Not supported type {}.".format(type(args[0]))) + if not all(key in ("lr", "momentum") for key in kwargs): + raise RuntimeError("Invalid SGD arguments. Expect {}, got {}.".format(("lr", "momentum"), kwargs)) + lr = kwargs.get("lr", 0.01) + momentum = kwargs.get("momentum", 0.9) + else: + raise RuntimeError("SGD only accept 0 or 1 sequential argument, but got {}: {}".format(len(args), args)) + super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) def construct_from_pytorch(self, model_params): @@ -20,7 +47,30 @@ def construct_from_pytorch(self, model_params): class Adam(Optimizer): - def __init__(self, model_params=None, lr=0.001, weight_decay=0.8): + def __init__(self, *args, **kwargs): + model_params, lr, weight_decay = None, 0.01, 0.9 + if len(args) == 0 and len(kwargs) == 0: + pass + elif len(args) == 1 and len(kwargs) == 0: + if isinstance(args[0], float) or isinstance(args[0], int): + lr = args[0] + elif hasattr(args[0], "__next__"): + model_params = args[0] + else: + raise RuntimeError("Not supported type {}.".format(type(args[0]))) + elif 2 >= len(kwargs) > 0 and len(args) <= 1: + if len(args) == 1: + if hasattr(args[0], "__next__"): + model_params = args[0] + else: + raise RuntimeError("Not supported type {}.".format(type(args[0]))) + if not all(key in ("lr", "weight_decay") for key in kwargs): + raise RuntimeError("Invalid Adam arguments. Expect {}, got {}.".format(("lr", "weight_decay"), kwargs)) + lr = kwargs.get("lr", 0.01) + weight_decay = kwargs.get("weight_decay", 0.9) + else: + raise RuntimeError("Adam only accept 0 or 1 sequential argument, but got {}: {}".format(len(args), args)) + super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay) def construct_from_pytorch(self, model_params): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 6d31e390..2a5a59e4 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -56,7 +56,10 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # increase_better is True. It means the exp result gets better if the indicator increases. # It is true by default. self.increase_better = False if metric_key[0] == "-" else True - self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key + if metric_key is not None: + self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key + else: + self.metric_key = None # prepare loss losser = _prepare_losser(losser) @@ -144,12 +147,13 @@ def pass_func(*args, **kwargs): del self._summary_writer def _train_epoch(self, data_iterator, model, epoch, start): - """Training process in one epoch. + """ - kwargs should contain: - - n_print: int, print training information every n steps. - - start: time.time(), the starting time of this step. - - epoch: int, + :param data_iterator: + :param model: + :param epoch: + :param start: + :return: """ for batch_x, batch_y in data_iterator: # TODO 这里可能会遇到问题,万一用户在model内部修改了prediction的device就会有问题 @@ -188,7 +192,7 @@ def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. :param model: a PyTorch model - :param is_test: bool, whether in test mode or not. 
+ :param bool is_test: whether in test mode or not. """ if is_test: @@ -263,7 +267,7 @@ def _better_eval_result(self, metrics): else: # metric_key is set if self.metric_key not in metric_dict: - raise RuntimeError(f"matric key {self.metric_key} not found in {metric_dict}") + raise RuntimeError(f"metric key {self.metric_key} not found in {metric_dict}") indicator_val = metric_dict[self.metric_key] is_better = True diff --git a/test/core/test_optimizer.py b/test/core/test_optimizer.py index 26e47d43..ab18b9be 100644 --- a/test/core/test_optimizer.py +++ b/test/core/test_optimizer.py @@ -2,20 +2,43 @@ import torch -from fastNLP.core.optimizer import SGD +from fastNLP.core.optimizer import SGD, Adam class TestOptim(unittest.TestCase): - def test_case(self): - optim = SGD(torch.LongTensor(10)) - print(optim.__dict__) + def test_SGD(self): + optim = SGD(torch.nn.Linear(10, 3).parameters()) + self.assertTrue("lr" in optim.__dict__["settings"]) + self.assertTrue("momentum" in optim.__dict__["settings"]) - optim_2 = SGD(lr=0.001) - print(optim_2.__dict__) + optim = SGD(0.001) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) - optim_2 = SGD(lr=0.002, momentum=0.989) - print(optim_2.__dict__) + optim = SGD(lr=0.001) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) - def test_case_2(self): + optim = SGD(lr=0.002, momentum=0.989) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.002) + self.assertEqual(optim.__dict__["settings"]["momentum"], 0.989) + + with self.assertRaises(RuntimeError): + _ = SGD("???") with self.assertRaises(RuntimeError): - _ = SGD(0.001) + _ = SGD(0.001, lr=0.002) + with self.assertRaises(RuntimeError): + _ = SGD(lr=0.009, shit=9000) + + def test_Adam(self): + optim = Adam(torch.nn.Linear(10, 3).parameters()) + self.assertTrue("lr" in optim.__dict__["settings"]) + self.assertTrue("weight_decay" in optim.__dict__["settings"]) + + optim = Adam(0.001) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) + + optim = Adam(lr=0.001) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) + + optim = Adam(lr=0.002, weight_decay=0.989) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.002) + self.assertEqual(optim.__dict__["settings"]["weight_decay"], 0.989) From d74901e0379ea8cf78dd62c6f2bfaf40dee9facf Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 2 Dec 2018 11:36:35 +0800 Subject: [PATCH 23/67] =?UTF-8?q?Trainer=20Update:=20*=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E5=88=9D=E5=A7=8B=E5=8C=96=E6=B3=A8=E9=87=8A=20*=20?= =?UTF-8?q?=E4=BB=8E=5Fbetter=5Feval=5Fresult=E4=B8=AD=E6=8A=BD=E5=8F=96ch?= =?UTF-8?q?eck=20metrics=E7=9A=84=E9=80=BB=E8=BE=91=E5=88=B0=5Fcheck=5Feva?= =?UTF-8?q?l=5Fresults=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 123 +++++++++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 45 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 2a5a59e4..78a26334 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -8,20 +8,21 @@ from torch import nn from fastNLP.core.batch import Batch +from fastNLP.core.dataset import DataSet +from fastNLP.core.losses import _prepare_losser +from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.optimizer import Adam from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester -from fastNLP.core.dataset import DataSet -from fastNLP.core.losses import 
_prepare_losser -from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.utils import CheckError -from fastNLP.core.utils import _check_loss_evaluate -from fastNLP.core.utils import _check_forward_error from fastNLP.core.utils import _build_args +from fastNLP.core.utils import _check_forward_error +from fastNLP.core.utils import _check_loss_evaluate from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature + class Trainer(object): """Main Training Loop @@ -33,6 +34,30 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, metric_key=None, **kwargs): + """ + + :param DataSet train_data: the training data + :param torch.nn.modules.module model: a PyTorch model + :param LossBase losser: a loss object + :param MetricBase or List[MetricBase] metrics: a metric object or a list of metrics + :param int n_epochs: the number of training epochs + :param int batch_size: batch size for training and validation + :param int print_every: step interval to print next training information. Default: -1(no print). + :param int validate_every: step interval to do next validation. Default: -1(validate every epoch). + :param DataSet dev_data: the validation data + :param use_cuda: + :param str save_path: file path to save models + :param Optimizer optimizer: an optimizer object + :param int check_code_level: level of FastNLP code checker. 0: ignore. 1: warning. 2: strict. + :param str metric_key: a single indicator used to decide the best model based on metric results. It must be one + of the keys returned by the FIRST metric in `metrics`. If the overall result gets better if the indicator gets + smaller, add a `-` character in front of the string. For example + :: + metric_key="-PPL" # language model gets better as perplexity gets smaller + + :param kwargs: + + """ super(Trainer, self).__init__() if not isinstance(train_data, DataSet): @@ -64,7 +89,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # prepare loss losser = _prepare_losser(losser) - if check_code_level>-1: + if check_code_level > -1: _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, check_level=check_code_level) @@ -245,52 +270,29 @@ def _better_eval_result(self, metrics): :return bool value: True means current results on dev set is the best. """ - if isinstance(metrics, tuple): - loss, metrics = metrics - - if isinstance(metrics, dict): - if len(metrics) == 1: - # only single metric, just use it - metric_dict = list(metrics.values())[0] - metrics_name = list(metrics.keys())[0] - else: - metrics_name = self.metrics[0].__class__.__name__ - if metrics_name not in metrics: - raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}") - metric_dict = metrics[metrics_name] - - if len(metric_dict) == 1: - indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0] - elif len(metric_dict) > 1 and self.metric_key is None: - raise RuntimeError( - f"Got multiple metric keys: {metric_dict}, but metric_key is not set. 
Which one to use?") - else: - # metric_key is set - if self.metric_key not in metric_dict: - raise RuntimeError(f"metric key {self.metric_key} not found in {metric_dict}") - indicator_val = metric_dict[self.metric_key] - - is_better = True - if self.best_metric_indicator is None: - # first-time validation - self.best_metric_indicator = indicator_val + indicator_val = _check_eval_results(metrics, self.metric_key, self.metrics) + is_better = True + if self.best_metric_indicator is None: + # first-time validation + self.best_metric_indicator = indicator_val + else: + if self.increase_better is True: + if indicator_val > self.best_metric_indicator: + self.best_metric_indicator = indicator_val + else: + is_better = False else: - if self.increase_better is True: - if indicator_val > self.best_metric_indicator: - self.best_metric_indicator = indicator_val - else: - is_better = False + if indicator_val < self.best_metric_indicator: + self.best_metric_indicator = indicator_val else: - if indicator_val < self.best_metric_indicator: - self.best_metric_indicator = indicator_val - else: - is_better = False - return is_better + is_better = False + return is_better DEFAULT_CHECK_BATCH_SIZE = 2 DEFAULT_CHECK_NUM_BATCH = 2 + def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, check_level=0): @@ -341,3 +343,34 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ # TODO 这里需要检查是否返回来的值是否是合理的 +def _check_eval_results(metrics, metric_key, metric_list): + # metrics: tester返回的结果 + # metric_key: 一个用来做筛选的指标,来自Trainer的初始化 + # metric_list: 多个用来做评价的指标,来自Trainer的初始化 + if isinstance(metrics, tuple): + loss, metrics = metrics + + if isinstance(metrics, dict): + if len(metrics) == 1: + # only single metric, just use it + metric_dict = list(metrics.values())[0] + metrics_name = list(metrics.keys())[0] + else: + metrics_name = metric_list[0].__class__.__name__ + if metrics_name not in metrics: + raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}") + metric_dict = metrics[metrics_name] + + if len(metric_dict) == 1: + indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0] + elif len(metric_dict) > 1 and metric_key is None: + raise RuntimeError( + f"Got multiple metric keys: {metric_dict}, but metric_key is not set. Which one to use?") + else: + # metric_key is set + if metric_key not in metric_dict: + raise RuntimeError(f"metric key {metric_key} not found in {metric_dict}") + indicator_val = metric_dict[metric_key] + else: + raise RuntimeError("Invalid metrics type. Expect {}, got {}".format((tuple, dict), type(metrics))) + return indicator_val From a05ffd31cd07f5ebce511260ec086d406c47d332 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 12:55:15 +0800 Subject: [PATCH 24/67] =?UTF-8?q?trainer=E5=A2=9E=E5=8A=A0=E5=AF=B9evaluat?= =?UTF-8?q?e=E7=BB=93=E6=9E=9C=E7=9A=84check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 78a26334..2c57057f 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -48,7 +48,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :param use_cuda: :param str save_path: file path to save models :param Optimizer optimizer: an optimizer object - :param int check_code_level: level of FastNLP code checker. 0: ignore. 
1: warning. 2: strict. + :param int check_code_level: level of FastNLP code checker. -1: don't check, 0: ignore. 1: warning. 2: strict. :param str metric_key: a single indicator used to decide the best model based on metric results. It must be one of the keys returned by the FIRST metric in `metrics`. If the overall result gets better if the indicator gets smaller, add a `-` character in front of the string. For example @@ -91,7 +91,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat if check_code_level > -1: _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, - check_level=check_code_level) + metric_key=metric_key, check_level=check_code_level) self.train_data = train_data self.dev_data = dev_data # If None, No validation. @@ -294,7 +294,7 @@ def _better_eval_result(self, metrics): def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, - dev_data=None, + dev_data=None, metric_key=None, check_level=0): # check get_loss 方法 model_devcie = model.parameters().__next__().device @@ -340,7 +340,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ tester = Tester(data=dataset[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, batch_size=batch_size, verbose=-1) evaluate_results = tester.test() - # TODO 这里需要检查是否返回来的值是否是合理的 + _check_eval_results(metrics=evaluate_results, metric_key=metric_key, metric_list=metrics) def _check_eval_results(metrics, metric_key, metric_list): From a90a62ab9bad71670e6ac580d3be9336a44ce169 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 14:28:44 +0800 Subject: [PATCH 25/67] metric bug fix --- fastNLP/core/losses.py | 2 +- fastNLP/core/metrics.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 564eb7ce..b1628ec8 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -112,7 +112,7 @@ def __init__(self): class BCELoss(LossBase): - def __init__(self): + def __init__(self, input=None, target=None): super(BCELoss, self).__init__() self.get_loss = F.binary_cross_entropy diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 5296b0bf..6b5fcb3c 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -124,22 +124,22 @@ def __init__(self, predictions=None, targets=None, masks=None, seq_lens=None): self.total = 0 self.acc_count = 0 - def evaluate(self, predictions, targets, masks=None, seq_lens=None): + def evaluate(self, input, targets, masks=None, seq_lens=None): """ - :param predictions: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: + :param input: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes]) :param targets: List of (torch.Tensor, or numpy.ndarray). Element's can be: - torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len]) + torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]), torch.Size([B, max_len]) :param masks: List of (torch.Tensor, or numpy.ndarray). Element's can be: None, None, torch.Size([B, max_len], torch.Size([B, max_len]) :param seq_lens: List of (torch.Tensor, or numpy.ndarray). Element's can be: None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided. 
:return: dict({'acc': float}) """ - if not isinstance(predictions, torch.Tensor): + if not isinstance(input, torch.Tensor): raise NameError(f"`predictions` in {get_func_signature(self.evaluate())} expects torch.Tensor," - f"got {type(predictions)}.") + f"got {type(input)}.") if not isinstance(targets, torch.Tensor): raise NameError(f"`targets` in {get_func_signature(self.evaluate())} expects torch.Tensor," f"got {type(targets)}.") @@ -154,21 +154,21 @@ def evaluate(self, predictions, targets, masks=None, seq_lens=None): if masks is None and seq_lens is not None: masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) - if predictions.size()==targets.size(): + if input.size()==targets.size(): pass - elif len(predictions.size())==len(targets.size())+1: - predictions = predictions.argmax(dim=-1) + elif len(input.size())==len(targets.size())+1: + predictions = input.argmax(dim=-1) else: raise RuntimeError(f"In {get_func_signature(self.evaluate())}, when predictions with " - f"size:{predictions.size()}, targets should with size: {predictions.size()} or " - f"{predictions.size()[:-1]}, got {targets.size()}.") + f"size:{input.size()}, targets should with size: {input.size()} or " + f"{input.size()[:-1]}, got {targets.size()}.") if masks is not None: - self.acc_count += torch.sum(torch.eq(predictions, targets).float() * masks.float()).item() + self.acc_count += torch.sum(torch.eq(input, targets).float() * masks.float()).item() self.total += torch.sum(masks.float()).item() else: - self.acc_count += torch.sum(torch.eq(predictions, targets).float()).item() - self.total += np.prod(list(torch.size(predictions))) + self.acc_count += torch.sum(torch.eq(input, targets).float()).item() + self.total += np.prod(list(input.size())) def get_metric(self, reset=True): evaluate_result = {'acc': self.acc_count/self.total} From 50f1c28b74c0cbd1595bdd3580ae7ec40afef007 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 14:29:11 +0800 Subject: [PATCH 26/67] metric bug fix --- fastNLP/core/metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 6b5fcb3c..0d83fe44 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -115,10 +115,10 @@ def __call__(self, output_dict, target_dict, check=False): class AccuracyMetric(MetricBase): - def __init__(self, predictions=None, targets=None, masks=None, seq_lens=None): + def __init__(self, input=None, targets=None, masks=None, seq_lens=None): super().__init__() - self._init_param_map(predictions=predictions, targets=targets, + self._init_param_map(input=input, targets=targets, masks=masks, seq_lens=seq_lens) self.total = 0 @@ -138,7 +138,7 @@ def evaluate(self, input, targets, masks=None, seq_lens=None): :return: dict({'acc': float}) """ if not isinstance(input, torch.Tensor): - raise NameError(f"`predictions` in {get_func_signature(self.evaluate())} expects torch.Tensor," + raise NameError(f"`input` in {get_func_signature(self.evaluate())} expects torch.Tensor," f"got {type(input)}.") if not isinstance(targets, torch.Tensor): raise NameError(f"`targets` in {get_func_signature(self.evaluate())} expects torch.Tensor," @@ -157,9 +157,9 @@ def evaluate(self, input, targets, masks=None, seq_lens=None): if input.size()==targets.size(): pass elif len(input.size())==len(targets.size())+1: - predictions = input.argmax(dim=-1) + input = input.argmax(dim=-1) else: - raise RuntimeError(f"In {get_func_signature(self.evaluate())}, when predictions with " + raise RuntimeError(f"In 
{get_func_signature(self.evaluate())}, when input with " f"size:{input.size()}, targets should with size: {input.size()} or " f"{input.size()[:-1]}, got {targets.size()}.") From 8d7d2b428cce4f7b8c8be12ca74810544c56e048 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 14:57:11 +0800 Subject: [PATCH 27/67] initial test for AccuracyMetric --- fastNLP/core/metrics.py | 60 ++++++++++++++++++++++++++------------- fastNLP/core/utils.py | 2 +- test/core/test_metrics.py | 17 +++++++++++ 3 files changed, 59 insertions(+), 20 deletions(-) create mode 100644 test/core/test_metrics.py diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 0d83fe44..6b8386c8 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -54,14 +54,32 @@ def _init_param_map(self, key_map=None, **kwargs): if len(key_set)>1: raise ValueError(f"Several params:{key_set} are provided with one output {value}.") + # check consistence between signature and param_map + func_spect = inspect.getfullargspec(self.evaluate) + func_args = func_spect.args + for func_param, input_param in self.param_map.items(): + if func_param not in func_args: + raise NameError(f"`{func_param}` not in {get_func_signature(self.evaluate)}. Please check the " + f"initialization params, or change {get_func_signature(self.evaluate)} signature.") + def get_metric(self, reset=True): raise NotImplemented def __call__(self, output_dict, target_dict, check=False): """ - :param output_dict: - :param target_dict: - :param check: boolean, + + This method will call self.evaluate method. + Before calling self.evaluate, it will first check the validity ofoutput_dict, target_dict + (1) whether self.evaluate has varargs, which is not supported. + (2) whether params needed by self.evaluate is not included in output_dict,target_dict. + (3) whether params needed by self.evaluate duplicate in output_dict, target_dict + (4) whether params in output_dict, target_dict are not used by evaluate.(Might cause warning) + Besides, before passing params into self.evaluate, this function will filter out params from output_dict and + target_dict which are not used in self.evaluate. (but if **kwargs presented in self.evaluate, no filtering + will be conducted) + :param output_dict: usually the output of forward or prediction function + :param target_dict: usually features set as target.. + :param check: boolean, if check is True, it will force check `varargs, missing, unsed, duplicated`. :return: """ if not callable(self.evaluate): @@ -73,7 +91,7 @@ def __call__(self, output_dict, target_dict, check=False): func_args = func_spect.args for func_param, input_param in self.param_map.items(): if func_param not in func_args: - raise NameError(f"{func_param} not in {get_func_signature(self.evaluate)}.") + raise NameError(f"`{func_param}` not in {get_func_signature(self.evaluate)}.") # 2. 
only part of the param_map are passed, left are not for arg in func_args: if arg not in self.param_map: @@ -97,8 +115,9 @@ def __call__(self, output_dict, target_dict, check=False): # check duplicated, unused, missing if check or not self._checked: - check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_output_dict]) - for key, value in check_res.items(): + check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_target_dict]) + for key in check_res._fields: + value = getattr(check_res, key) new_value = list(value) for idx, func_param in enumerate(value): if func_param in self._reverse_param_map: @@ -115,21 +134,21 @@ def __call__(self, output_dict, target_dict, check=False): class AccuracyMetric(MetricBase): - def __init__(self, input=None, targets=None, masks=None, seq_lens=None): + def __init__(self, input=None, target=None, masks=None, seq_lens=None): super().__init__() - self._init_param_map(input=input, targets=targets, + self._init_param_map(input=input, target=target, masks=masks, seq_lens=seq_lens) self.total = 0 self.acc_count = 0 - def evaluate(self, input, targets, masks=None, seq_lens=None): + def evaluate(self, input, target, masks=None, seq_lens=None): """ :param input: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes]) - :param targets: List of (torch.Tensor, or numpy.ndarray). Element's can be: + :param target: List of (torch.Tensor, or numpy.ndarray). Element's can be: torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]), torch.Size([B, max_len]) :param masks: List of (torch.Tensor, or numpy.ndarray). Element's can be: None, None, torch.Size([B, max_len], torch.Size([B, max_len]) @@ -140,9 +159,9 @@ def evaluate(self, input, targets, masks=None, seq_lens=None): if not isinstance(input, torch.Tensor): raise NameError(f"`input` in {get_func_signature(self.evaluate())} expects torch.Tensor," f"got {type(input)}.") - if not isinstance(targets, torch.Tensor): - raise NameError(f"`targets` in {get_func_signature(self.evaluate())} expects torch.Tensor," - f"got {type(targets)}.") + if not isinstance(target, torch.Tensor): + raise NameError(f"`target` in {get_func_signature(self.evaluate())} expects torch.Tensor," + f"got {type(target)}.") if masks is not None and not isinstance(masks, torch.Tensor): raise NameError(f"`masks` in {get_func_signature(self.evaluate())} expects torch.Tensor," @@ -154,20 +173,23 @@ def evaluate(self, input, targets, masks=None, seq_lens=None): if masks is None and seq_lens is not None: masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) - if input.size()==targets.size(): + if input.size()==target.size(): pass - elif len(input.size())==len(targets.size())+1: + elif len(input.size())==len(target.size())+1: input = input.argmax(dim=-1) else: raise RuntimeError(f"In {get_func_signature(self.evaluate())}, when input with " - f"size:{input.size()}, targets should with size: {input.size()} or " - f"{input.size()[:-1]}, got {targets.size()}.") + f"size:{input.size()}, target should with size: {input.size()} or " + f"{input.size()[:-1]}, got {target.size()}.") + + input = input.float() + target = target.float() if masks is not None: - self.acc_count += torch.sum(torch.eq(input, targets).float() * masks.float()).item() + self.acc_count += torch.sum(torch.eq(input, target).float() * masks.float()).item() self.total += torch.sum(masks.float()).item() else: - self.acc_count += 
torch.sum(torch.eq(input, targets).float()).item() + self.acc_count += torch.sum(torch.eq(input, target).float()).item() self.total += np.prod(list(input.size())) def get_metric(self, reset=True): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 08640d0f..62f60cf7 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -123,7 +123,7 @@ def _check_arg_dict_list(func, args): input_args = set(input_arg_count.keys()) missing = list(require_args - input_args) unused = list(input_args - all_args) - varargs = [] if spect.varargs else [arg for arg in spect.varargs] + varargs = [] if not spect.varargs else [arg for arg in spect.varargs] return CheckRes(missing=missing, unused=unused, duplicated=duplicated, diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py new file mode 100644 index 00000000..b279d7ca --- /dev/null +++ b/test/core/test_metrics.py @@ -0,0 +1,17 @@ + +import unittest + +class TestOptim(unittest.TestCase): + def test_AccuracyMetric(self): + from fastNLP.core.metrics import AccuracyMetric + import torch + import numpy as np + + # (1) only input, targets passed + output_dict = {"input": torch.zeros(4, 3)} + target_dict = {'target': torch.zeros(4)} + metric = AccuracyMetric() + + metric(output_dict=output_dict, target_dict=target_dict) + print(metric.get_metric()) + From c2d2137500bf9e4c69494e3857ce50a9d5ec8e42 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 15:19:05 +0800 Subject: [PATCH 28/67] bug fix in MetricAccuracy --- fastNLP/core/metrics.py | 47 +++++++++++++-------------- test/core/test_metrics.py | 67 +++++++++++++++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 29 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 6b8386c8..ee074feb 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -52,15 +52,16 @@ def _init_param_map(self, key_map=None, **kwargs): value_counter[value].add(key) for value, key_set in value_counter.items(): if len(key_set)>1: - raise ValueError(f"Several params:{key_set} are provided with one output {value}.") + raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.") # check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) func_args = func_spect.args for func_param, input_param in self.param_map.items(): if func_param not in func_args: - raise NameError(f"`{func_param}` not in {get_func_signature(self.evaluate)}. Please check the " - f"initialization params, or change {get_func_signature(self.evaluate)} signature.") + raise NameError(f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the " + f"initialization parameters, or change the signature of" + f" {get_func_signature(self.evaluate)}.") def get_metric(self, reset=True): raise NotImplemented @@ -134,19 +135,19 @@ def __call__(self, output_dict, target_dict, check=False): class AccuracyMetric(MetricBase): - def __init__(self, input=None, target=None, masks=None, seq_lens=None): + def __init__(self, pred=None, target=None, masks=None, seq_lens=None): super().__init__() - self._init_param_map(input=input, target=target, + self._init_param_map(pred=pred, target=target, masks=masks, seq_lens=seq_lens) self.total = 0 self.acc_count = 0 - def evaluate(self, input, target, masks=None, seq_lens=None): + def evaluate(self, pred, target, masks=None, seq_lens=None): """ - :param input: List of (torch.Tensor, or numpy.ndarray). 
Element's shape can be: + :param pred: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes]) :param target: List of (torch.Tensor, or numpy.ndarray). Element's can be: torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]), torch.Size([B, max_len]) @@ -156,41 +157,41 @@ def evaluate(self, input, target, masks=None, seq_lens=None): None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided. :return: dict({'acc': float}) """ - if not isinstance(input, torch.Tensor): - raise NameError(f"`input` in {get_func_signature(self.evaluate())} expects torch.Tensor," - f"got {type(input)}.") + if not isinstance(pred, torch.Tensor): + raise TypeError(f"`pred` in {get_func_signature(self.evaluate)} must be torch.Tensor," + f"got {type(pred)}.") if not isinstance(target, torch.Tensor): - raise NameError(f"`target` in {get_func_signature(self.evaluate())} expects torch.Tensor," + raise TypeError(f"`target` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(target)}.") if masks is not None and not isinstance(masks, torch.Tensor): - raise NameError(f"`masks` in {get_func_signature(self.evaluate())} expects torch.Tensor," + raise TypeError(f"`masks` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(masks)}.") elif seq_lens is not None and not isinstance(seq_lens, torch.Tensor): - raise NameError(f"`seq_lens` in {get_func_signature(self.evaluate())} expects torch.Tensor," + raise TypeError(f"`seq_lens` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_lens)}.") if masks is None and seq_lens is not None: masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) - if input.size()==target.size(): + if pred.size()==target.size(): pass - elif len(input.size())==len(target.size())+1: - input = input.argmax(dim=-1) + elif len(pred.size())==len(target.size())+1: + pred = pred.argmax(dim=-1) else: - raise RuntimeError(f"In {get_func_signature(self.evaluate())}, when input with " - f"size:{input.size()}, target should with size: {input.size()} or " - f"{input.size()[:-1]}, got {target.size()}.") + raise RuntimeError(f"In {get_func_signature(self.evaluate)}, when pred have " + f"size:{pred.size()}, target should have size: {pred.size()} or " + f"{pred.size()[:-1]}, got {target.size()}.") - input = input.float() + pred = pred.float() target = target.float() if masks is not None: - self.acc_count += torch.sum(torch.eq(input, target).float() * masks.float()).item() + self.acc_count += torch.sum(torch.eq(pred, target).float() * masks.float()).item() self.total += torch.sum(masks.float()).item() else: - self.acc_count += torch.sum(torch.eq(input, target).float()).item() - self.total += np.prod(list(input.size())) + self.acc_count += torch.sum(torch.eq(pred, target).float()).item() + self.total += np.prod(list(pred.size())) def get_metric(self, reset=True): evaluate_result = {'acc': self.acc_count/self.total} diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index b279d7ca..bad3ebba 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -1,17 +1,72 @@ import unittest -class TestOptim(unittest.TestCase): - def test_AccuracyMetric(self): - from fastNLP.core.metrics import AccuracyMetric - import torch - import numpy as np +from fastNLP.core.metrics import AccuracyMetric +import torch +import numpy as np +class TestAccuracyMetric(unittest.TestCase): + def test_AccuracyMetric1(self): 
# (1) only input, targets passed - output_dict = {"input": torch.zeros(4, 3)} + output_dict = {"pred": torch.zeros(4, 3)} target_dict = {'target': torch.zeros(4)} metric = AccuracyMetric() metric(output_dict=output_dict, target_dict=target_dict) print(metric.get_metric()) + def test_AccuracyMetric2(self): + # (2) with corrupted size + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4)} + metric = AccuracyMetric() + + metric(output_dict=output_dict, target_dict=target_dict) + print(metric.get_metric()) + + def test_AccuracyMetric3(self): + # (3) with check=False , the second batch is corrupted size + metric = AccuracyMetric() + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)} + metric(output_dict=output_dict, target_dict=target_dict) + + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4)} + metric(output_dict=output_dict, target_dict=target_dict) + + print(metric.get_metric()) + + def test_AccuracyMetric4(self): + # (4) with check=True , the second batch is corrupted size + metric = AccuracyMetric() + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)} + metric(output_dict=output_dict, target_dict=target_dict) + + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4)} + metric(output_dict=output_dict, target_dict=target_dict, check=True) + + print(metric.get_metric()) + + def test_AccuaryMetric5(self): + # (5) check reset + metric = AccuracyMetric() + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)} + metric(output_dict=output_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) + + output_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)+1} + metric(output_dict=output_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc':0}) + + def test_AccuaryMetric6(self): + # (6) check numpy array is not acceptable + metric = AccuracyMetric() + output_dict = {"pred": np.zeros((4, 3, 2))} + target_dict = {'target': np.zeros((4, 3))} + metric(output_dict=output_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) \ No newline at end of file From 125c2718e428c7cc9607db161fcd0bd90983780d Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 2 Dec 2018 16:38:38 +0800 Subject: [PATCH 29/67] Update * fix bug in DataSet.split * fix bugs in FieldArray, to allow content as a list * fix bug in losses check * ... 
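A rough sketch of the DataSet behaviour this commit is after, patterned on the test code added later in the same patch; the field names "x" and "y" and the sizes are purely illustrative:

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

# build a tiny dataset and mark which fields feed the model and which feed the loss/metric
data_set = DataSet([Instance(x=[1.0, 2.0], y=[0.0]) for _ in range(10)])
data_set.set_input("x", flag=True)
data_set.set_target("y", flag=True)

# with the dataset.py fix in this patch, both halves keep their is_input / is_target flags
train_set, dev_set = data_set.split(0.3)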
--- fastNLP/core/dataset.py | 6 +++++ fastNLP/core/fieldarray.py | 23 ++++++++++++++---- fastNLP/core/losses.py | 11 +++++---- fastNLP/core/metrics.py | 11 +++++---- fastNLP/core/tester.py | 31 ++++++++++++------------ fastNLP/core/trainer.py | 9 ++++--- fastNLP/core/utils.py | 6 ++--- fastNLP/models/base_model.py | 18 ++++++++++---- test/core/test_loss.py | 21 ++++++++-------- test/core/test_trainer.py | 46 +++++++++++++++++++++++++++++++++--- 10 files changed, 129 insertions(+), 53 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 920e9f11..6d2a94d6 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -260,6 +260,12 @@ def split(self, dev_ratio): dev_set.append(self[idx]) for idx in train_indices: train_set.append(self[idx]) + for field_name in self.field_arrays: + train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input + train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target + dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input + dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target + return train_set, dev_set @classmethod diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 714fa169..976dc2c6 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -11,7 +11,7 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False """ :param str name: the name of the FieldArray - :param list content: a list of int, float, or other objects. + :param list content: a list of int, float, or a list of list. :param int padding_val: the integer for padding. Default: 0. :param bool is_target: If True, this FieldArray is used to compute loss. :param bool is_input: If True, this FieldArray is used to the model input. @@ -26,7 +26,14 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False @staticmethod def _type_detection(content): - type_set = set([type(item) for item in content]) + + if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list): + # 2-D list + # TODO: refactor + type_set = set([type(item) for item in content[0]]) + else: + # 1-D list + type_set = set([type(item) for item in content]) if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)): return type_set.pop() elif len(type_set) == 2 and float in type_set and int in type_set: @@ -48,7 +55,7 @@ def __repr__(self): def append(self, val): """Add a new item to the tail of FieldArray. - :param val: int, float, or str. + :param val: int, float, str, or a list of them. """ val_type = type(val) if val_type is int and self.pytype is float: @@ -60,9 +67,17 @@ def append(self, val): self.content[idx] = float(self.content[idx]) self.pytype = float self.dtype = self._map_to_np_type(self.pytype) - + elif val_type is list: + if len(val) == 0: + raise ValueError("Cannot append an empty list.") + else: + if type(val[0]) != self.pytype: + raise ValueError( + "Cannot append a list of {}-type value into a {}-tpye FieldArray.". 
+ format(type(val[0]), self.pytype)) elif val_type != self.pytype: raise ValueError("Cannot append a {}-type value into a {}-tpye FieldArray.".format(val_type, self.pytype)) + self.content.append(val) def __getitem__(self, indices): diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index b1628ec8..981bef89 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -3,11 +3,11 @@ from fastNLP.core.utils import CheckError from fastNLP.core.utils import CheckRes +from fastNLP.core.utils import _build_args +from fastNLP.core.utils import _check_function_or_method from fastNLP.core.utils import _get_arg_list from fastNLP.core.utils import _map_args from fastNLP.core.utils import get_func_signature -from fastNLP.core.utils import _build_args -from fastNLP.core.utils import _check_function_or_method class LossBase(object): @@ -71,7 +71,8 @@ def __call__(self, output_dict, target_dict, force_check=False): if len(duplicated) > 0 or len(missing) > 0: raise CheckError( - CheckRes(missing=missing, unused=[], duplicated=duplicated, required=[], all_needed=[]), + CheckRes(missing=missing, unused=[], duplicated=duplicated, required=[], all_needed=[], + varargs=varargs), func_signature=get_func_signature(self.get_loss) ) @@ -90,9 +91,9 @@ def __call__(self, output_dict, target_dict, force_check=False): return loss -class NewLoss(LossBase): +class LossFunc(LossBase): def __init__(self, func, key_map=None, **kwargs): - super(NewLoss, self).__init__() + super(LossFunc, self).__init__() _check_function_or_method(func) if key_map is not None: if not isinstance(key_map, dict): diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index ee074feb..34d438e7 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -1,17 +1,18 @@ -import warnings import inspect +import warnings from collections import defaultdict import numpy as np import torch -from fastNLP.core.utils import get_func_signature -from fastNLP.core.utils import _check_arg_dict_list -from fastNLP.core.utils import _build_args from fastNLP.core.utils import CheckError +from fastNLP.core.utils import _build_args +from fastNLP.core.utils import _check_arg_dict_list +from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import seq_lens_to_masks + class MetricBase(object): def __init__(self): self.param_map = {} # key is param in function, value is input param. 
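As a usage sketch of the parameter mapping that _init_param_map sets up, patterned on the AccuracyMetric tests earlier in this series; the dict keys and tensor shapes are only illustrative:

import torch
from fastNLP.core.metrics import AccuracyMetric

# `pred` and `target` are evaluate() arguments; the strings name the dict keys they are read from
metric = AccuracyMetric(pred='predictions', target='targets')

logits = torch.randn(4, 3, 5)      # [batch, max_len, n_classes]
labels = torch.zeros(4, 3).long()  # [batch, max_len]
metric({'predictions': logits}, {'targets': labels})
print(metric.get_metric())         # {'acc': some value in [0, 1]}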
@@ -46,7 +47,7 @@ def _init_param_map(self, key_map=None, **kwargs): if value is None: self.param_map[key] = key continue - if isinstance(value, str): + if not isinstance(value, str): raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") self.param_map[key] = value value_counter[value].add(key) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index f62d9337..0c3bcefb 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -1,18 +1,18 @@ -import itertools from collections import defaultdict import torch from torch import nn from fastNLP.core.batch import Batch -from fastNLP.core.sampler import SequentialSampler from fastNLP.core.dataset import DataSet +from fastNLP.core.metrics import _prepare_metrics +from fastNLP.core.sampler import SequentialSampler from fastNLP.core.utils import CheckError from fastNLP.core.utils import _build_args -from fastNLP.core.utils import get_func_signature -from fastNLP.core.utils import _move_dict_value_to_device -from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.utils import _check_loss_evaluate +from fastNLP.core.utils import _move_dict_value_to_device +from fastNLP.core.utils import get_func_signature + class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ @@ -27,16 +27,6 @@ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose= self.metrics = _prepare_metrics(metrics) - # check predict - if hasattr(self._model, 'predict'): - self._predict_func = self._model.predict - if not callable(self._predict_func): - _model_name = model.__class__.__name__ - raise TypeError(f"`{_model_name}.predict` must be callable to be used " - f"for evaluation, not `{type(self._predict_func)}`.") - else: - self._predict_func = self._model.forward - self.data = data if torch.cuda.is_available() and self.use_cuda: self._model = model.cuda() @@ -45,9 +35,18 @@ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose= self.use_cuda = use_cuda self.batch_size = batch_size self.verbose = verbose - self._model_device = model.parameters().__next__().device + # check predict + if hasattr(self._model, 'predict'): + self._predict_func = self._model.predict + if not callable(self._predict_func): + _model_name = model.__class__.__name__ + raise TypeError(f"`{_model_name}.predict` must be callable to be used " + f"for evaluation, not `{type(self._predict_func)}`.") + else: + self._predict_func = self._model.forward + def test(self): # turn on the testing mode; clean up the history network = self._model diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 2c57057f..2cf18b90 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -80,8 +80,9 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # parse metric_key # increase_better is True. It means the exp result gets better if the indicator increases. # It is true by default. 
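For orientation, a standalone paraphrase of the metric_key convention handled in the lines below; this is a sketch of the intent, not code from the patch:

def parse_metric_key(metric_key):
    # "acc" or "+acc" -> watch `acc`, a larger value is better
    # "-loss"         -> watch `loss`, a smaller value is better
    # None            -> no single indicator, so metric_key[0] must never be indexed
    increase_better = True
    if metric_key is not None:
        increase_better = False if metric_key[0] == "-" else True
        metric_key = metric_key[1:] if metric_key[0] in ("+", "-") else metric_key
    return metric_key, increase_better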
- self.increase_better = False if metric_key[0] == "-" else True + self.increase_better = True if metric_key is not None: + self.increase_better = False if metric_key[0] == "-" else True self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key else: self.metric_key = None @@ -208,10 +209,12 @@ def _train_epoch(self, data_iterator, model, epoch, start): def _do_validation(self): res = self.tester.test() for name, num in res.items(): - self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) + pass + # self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) if self.save_path is not None and self._better_eval_result(res): + metric_key = self.metric_key if self.metric_key is not None else "None" self._save_model(self.model, - "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) + "best_" + "_".join([self.model.__class__.__name__, metric_key, self.start_time])) def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 62f60cf7..c9cd7c03 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -5,9 +5,8 @@ from collections import Counter from collections import namedtuple -import torch import numpy as np - +import torch CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs'], verbose=False) @@ -266,7 +265,8 @@ def _check_forward_error(forward_func, batch_x, check_level): if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") if check_res.missing: - errs.append(f"\tmissing param: {check_res.missing}, provided with {list(batch_x.keys())}.") + errs.append(f"\tmissing param: {check_res.missing}, provided with {list(batch_x.keys())}. " + f"Please set {check_res.missing} as input.") if check_res.unused: _unused = [f"\tunused param: {check_res.unused}"] if check_level == STRICT_CHECK_LEVEL: diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 829f7c9c..09274d2d 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,7 +1,5 @@ import torch -from fastNLP.core.trainer import Trainer - class BaseModel(torch.nn.Module): """Base PyTorch model for all models. 
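The next hunk swaps the old fit() shortcut for NotImplementedError and adds a toy LinearClassifier; a minimal usage sketch, with dimensions borrowed from the trainer test in this patch:

import torch
from fastNLP.models.base_model import LinearClassifier

model = LinearClassifier(in_feature_dim=2, out_feature_dim=1)
# forward() returns a dict, so Trainer/Tester can pick up the "predict" entry by name
out = model(torch.randn(4, 2))
print(out["predict"].size())  # torch.Size([4, 1])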
@@ -11,8 +9,20 @@ def __init__(self): super(BaseModel, self).__init__() def fit(self, train_data, dev_data=None, **train_args): - trainer = Trainer(**train_args) - trainer.train(self, train_data, dev_data) + raise NotImplementedError def predict(self, *args, **kwargs): raise NotImplementedError + + +class LinearClassifier(BaseModel): + def __init__(self, in_feature_dim, out_feature_dim): + super(LinearClassifier, self).__init__() + self.linear = torch.nn.Linear(in_feature_dim, out_feature_dim) + self.softmax = torch.nn.Softmax() + + def forward(self, x): + return {"predict": self.softmax(self.linear(x))} + + def predict(self, x): + return {"predict": self.softmax(self.linear(x))} diff --git a/test/core/test_loss.py b/test/core/test_loss.py index fddc56e9..edff342d 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -16,7 +16,8 @@ def test_case_1(self): # loss_func = loss.Loss("nll") print(callable(tc.nn.NLLLoss)) - loss_func = loss.NewLoss(F.nll_loss) + + loss_func = loss.LossFunc(F.nll_loss) nll_loss = loss.NLLLoss() @@ -330,36 +331,36 @@ def func6(a, b, **kwargs): c = kwargs['c'] return (a + b) * c - import torch - from fastNLP.core.losses import LossBase, NewLoss - get_loss = NewLoss(func, {'a': 'predict', 'b': 'truth'}) +from fastNLP.core.losses import LossFunc + +get_loss = LossFunc(func, {'a': 'predict', 'b': 'truth'}) predict = torch.randn(5, 3) truth = torch.LongTensor([1, 0, 1, 2, 1]) loss1 = get_loss({'predict': predict}, {'truth': truth}) - get_loss_2 = NewLoss(func2, {'a': 'predict'}) +get_loss_2 = LossFunc(func2, {'a': 'predict'}) loss2 = get_loss_2({'predict': predict}, {'truth': truth}) - get_loss_3 = NewLoss(func3) +get_loss_3 = LossFunc(func3) loss3 = get_loss_3({'predict': predict}, {'truth': truth}) print(loss1, loss2, loss3) assert loss1 == loss2 and loss1 == loss3 - get_loss_4 = NewLoss(func4) +get_loss_4 = LossFunc(func4) loss4 = get_loss_4({'a': 1, 'b': 3}, {}) print(loss4) assert loss4 == (1 + 3) * 2 - get_loss_5 = NewLoss(func4) +get_loss_5 = LossFunc(func4) loss5 = get_loss_5({'a': 1, 'b': 3}, {'c': 4}) print(loss5) assert loss5 == (1 + 3) * 4 - get_loss_6 = NewLoss(func6) +get_loss_6 = LossFunc(func6) loss6 = get_loss_6({'a': 1, 'b': 3}, {'c': 4}) print(loss6) assert loss6 == (1 + 3) * 4 - get_loss_7 = NewLoss(func6, c='cc') +get_loss_7 = LossFunc(func6, c='cc') loss7 = get_loss_7({'a': 1, 'b': 3}, {'cc': 4}) print(loss7) assert loss7 == (1 + 3) * 4 diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 08df6a49..0194d254 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,7 +1,47 @@ import unittest +import numpy as np +import torch -class TestTrainer(unittest.TestCase): - def test_case_1(self): - pass +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance +from fastNLP.core.losses import LossFunc +from fastNLP.core.metrics import AccuracyMetric +from fastNLP.core.optimizer import SGD +from fastNLP.core.trainer import Trainer +from fastNLP.models.base_model import LinearClassifier + +class TrainerTestGround(unittest.TestCase): + def test_case(self): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in 
class_B]) + + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + train_set, dev_set = data_set.split(0.3) + + model = LinearClassifier(2, 1) + + trainer = Trainer(train_set, model, + losser=LossFunc(torch.nn.functional.binary_cross_entropy, + key_map={"target": "y", "input": "predict"}), + metrics=AccuracyMetric(pred="predict", target="y"), + n_epochs=10, + batch_size=32, + print_every=10, + validate_every=-1, + dev_data=dev_set, + optimizer=SGD(0.001), + check_code_level=2 + ) + trainer.train() From 234ceb6fa3c6eb12372c58c5b8b79530332b4119 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 16:39:28 +0800 Subject: [PATCH 30/67] fix bug in MetricBase --- fastNLP/core/metrics.py | 48 +++++----- test/core/test_metrics.py | 178 +++++++++++++++++++++++++------------- 2 files changed, 144 insertions(+), 82 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index ee074feb..595783f7 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -46,7 +46,7 @@ def _init_param_map(self, key_map=None, **kwargs): if value is None: self.param_map[key] = key continue - if isinstance(value, str): + if not isinstance(value, str): raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") self.param_map[key] = value value_counter[value].add(key) @@ -56,17 +56,22 @@ def _init_param_map(self, key_map=None, **kwargs): # check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) - func_args = func_spect.args + func_args = [arg for arg in func_spect.args if arg!='self'] for func_param, input_param in self.param_map.items(): if func_param not in func_args: raise NameError(f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the " f"initialization parameters, or change the signature of" f" {get_func_signature(self.evaluate)}.") + # evaluate should not have varargs. + if func_spect.varargs: + raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.evaluate)}(Do not use " + f"positional argument.).") + def get_metric(self, reset=True): raise NotImplemented - def __call__(self, output_dict, target_dict, check=False): + def __call__(self, pred_dict, target_dict, check=False): """ This method will call self.evaluate method. @@ -78,7 +83,7 @@ def __call__(self, output_dict, target_dict, check=False): Besides, before passing params into self.evaluate, this function will filter out params from output_dict and target_dict which are not used in self.evaluate. (but if **kwargs presented in self.evaluate, no filtering will be conducted) - :param output_dict: usually the output of forward or prediction function + :param pred_dict: usually the output of forward or prediction function :param target_dict: usually features set as target.. :param check: boolean, if check is True, it will force check `varargs, missing, unsed, duplicated`. :return: @@ -89,46 +94,47 @@ def __call__(self, output_dict, target_dict, check=False): if not self._checked: # 1. 
check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) - func_args = func_spect.args - for func_param, input_param in self.param_map.items(): - if func_param not in func_args: - raise NameError(f"`{func_param}` not in {get_func_signature(self.evaluate)}.") + func_args = set([arg for arg in func_spect.args if arg!='self']) + for func_arg, input_arg in self.param_map.items(): + if func_arg not in func_args: + raise NameError(f"`{func_arg}` not in {get_func_signature(self.evaluate)}.") + # 2. only part of the param_map are passed, left are not for arg in func_args: if arg not in self.param_map: self.param_map[arg] = arg #This param does not need mapping. self._evaluate_args = func_args - self._reverse_param_map = {value: key for key, value in self.param_map.items()} + self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()} # need to wrap inputs in dict. - mapped_output_dict = {} + mapped_pred_dict = {} mapped_target_dict = {} - for func_arg in self._evaluate_args: - input_arg = self.param_map[func_arg] + for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())): if input_arg in self._reverse_param_map: - mapped_arg = func_arg + mapped_arg = self._reverse_param_map[input_arg] else: mapped_arg = input_arg - if input_arg in output_dict: - mapped_output_dict[mapped_arg] = output_dict[input_arg] + if input_arg in pred_dict: + mapped_pred_dict[mapped_arg] = pred_dict[input_arg] if input_arg in target_dict: mapped_target_dict[mapped_arg] = target_dict[input_arg] # check duplicated, unused, missing if check or not self._checked: - check_res = _check_arg_dict_list(self.evaluate, [mapped_output_dict, mapped_target_dict]) + check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict]) for key in check_res._fields: value = getattr(check_res, key) new_value = list(value) - for idx, func_param in enumerate(value): - if func_param in self._reverse_param_map: - new_value[idx] = self._reverse_param_map[func_param] + f'(assign to {func_param})' + # TODO 这里报错的逻辑应该是怎样的? 
+ for idx, func_arg in enumerate(value): + if func_arg in self.param_map: + new_value[idx] = self.param_map[func_arg] + f'(try to get value from {self.param_map[func_arg]})' else: - new_value[idx] = func_param + new_value[idx] = func_arg if check_res.missing or check_res.duplicated or check_res.varargs: raise CheckError(check_res=check_res, func_signature=get_func_signature(self.evaluate)) - refined_args = _build_args(self.evaluate, **mapped_output_dict, **mapped_target_dict) + refined_args = _build_args(self.evaluate, **mapped_pred_dict, **mapped_target_dict) self.evaluate(**refined_args) self._checked = True diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index bad3ebba..c6a8523e 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -6,67 +6,123 @@ import numpy as np class TestAccuracyMetric(unittest.TestCase): - def test_AccuracyMetric1(self): - # (1) only input, targets passed - output_dict = {"pred": torch.zeros(4, 3)} - target_dict = {'target': torch.zeros(4)} - metric = AccuracyMetric() + # def test_AccuracyMetric1(self): + # # (1) only input, targets passed + # pred_dict = {"pred": torch.zeros(4, 3)} + # target_dict = {'target': torch.zeros(4)} + # metric = AccuracyMetric() + # + # metric(pred_dict=pred_dict, target_dict=target_dict) + # print(metric.get_metric()) + # + # def test_AccuracyMetric2(self): + # # (2) with corrupted size + # try: + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4)} + # metric = AccuracyMetric() + # + # metric(pred_dict=pred_dict, target_dict=target_dict) + # print(metric.get_metric()) + # except Exception as e: + # print(e) + # return + # self.assertTrue(True, False), "No exception catches." + # + # def test_AccuracyMetric3(self): + # # (3) with check=False , the second batch is corrupted size + # try: + # metric = AccuracyMetric() + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4, 3)} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4)} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # + # print(metric.get_metric()) + # except Exception as e: + # print(e) + # return + # self.assertTrue(True, False), "No exception catches." + # + # def test_AccuracyMetric4(self): + # # (4) with check=True , the second batch is corrupted size + # try: + # metric = AccuracyMetric() + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4, 3)} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4)} + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) + # + # print(metric.get_metric()) + # + # except Exception as e: + # print(e) + # return + # self.assertTrue(True, False), "No exception catches." 
+ # + # def test_AccuaryMetric5(self): + # # (5) check reset + # metric = AccuracyMetric() + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4, 3)} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + # + # pred_dict = {"pred": torch.zeros(4, 3, 2)} + # target_dict = {'target': torch.zeros(4, 3)+1} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # self.assertDictEqual(metric.get_metric(), {'acc':0}) + # + # def test_AccuaryMetric6(self): + # # (6) check numpy array is not acceptable + # try: + # metric = AccuracyMetric() + # pred_dict = {"pred": np.zeros((4, 3, 2))} + # target_dict = {'target': np.zeros((4, 3))} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + # except Exception as e: + # print(e) + # return + # self.assertTrue(True, False), "No exception catches." - metric(output_dict=output_dict, target_dict=target_dict) - print(metric.get_metric()) + # def test_AccuaryMetric7(self): + # # (7) check map, match + # metric = AccuracyMetric(pred='predictions', target='targets') + # pred_dict = {"predictions": torch.zeros(4, 3, 2)} + # target_dict = {'targets': torch.zeros(4, 3)} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + # + # def test_AccuaryMetric8(self): + # # (8) check map, does not match + # try: + # metric = AccuracyMetric(pred='predictions', target='targets') + # pred_dict = {"prediction": torch.zeros(4, 3, 2)} + # target_dict = {'targets': torch.zeros(4, 3)} + # metric(pred_dict=pred_dict, target_dict=target_dict) + # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + # except Exception as e: + # print(e) + # return + # self.assertTrue(True, False), "No exception catches." - def test_AccuracyMetric2(self): - # (2) with corrupted size - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4)} - metric = AccuracyMetric() + def test_AccuaryMetric9(self): + # (9) check map, include unused + try: + metric = AccuracyMetric(pred='predictions', target='targets') + pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused':1} + target_dict = {'targets': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) + except Exception as e: + print(e) + return + self.assertTrue(True, False), "No exception catches." 
- metric(output_dict=output_dict, target_dict=target_dict) - print(metric.get_metric()) - - def test_AccuracyMetric3(self): - # (3) with check=False , the second batch is corrupted size - metric = AccuracyMetric() - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4, 3)} - metric(output_dict=output_dict, target_dict=target_dict) - - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4)} - metric(output_dict=output_dict, target_dict=target_dict) - - print(metric.get_metric()) - - def test_AccuracyMetric4(self): - # (4) with check=True , the second batch is corrupted size - metric = AccuracyMetric() - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4, 3)} - metric(output_dict=output_dict, target_dict=target_dict) - - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4)} - metric(output_dict=output_dict, target_dict=target_dict, check=True) - - print(metric.get_metric()) - - def test_AccuaryMetric5(self): - # (5) check reset - metric = AccuracyMetric() - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4, 3)} - metric(output_dict=output_dict, target_dict=target_dict) - self.assertDictEqual(metric.get_metric(), {'acc': 1}) - - output_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4, 3)+1} - metric(output_dict=output_dict, target_dict=target_dict) - self.assertDictEqual(metric.get_metric(), {'acc':0}) - - def test_AccuaryMetric6(self): - # (6) check numpy array is not acceptable - metric = AccuracyMetric() - output_dict = {"pred": np.zeros((4, 3, 2))} - target_dict = {'target': np.zeros((4, 3))} - metric(output_dict=output_dict, target_dict=target_dict) - self.assertDictEqual(metric.get_metric(), {'acc': 1}) \ No newline at end of file From 201f5109d6d34d848a79e32e6f45b9d3ae8ef66f Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 2 Dec 2018 19:45:25 +0800 Subject: [PATCH 31/67] Updates: * improve Loss initialization interface * improve test codes for trainer --- fastNLP/core/losses.py | 68 +++- fastNLP/core/metrics.py | 15 +- fastNLP/models/base_model.py | 16 +- test/core/test_loss.py | 658 ++++++++++++++++------------------- test/core/test_trainer.py | 14 +- 5 files changed, 385 insertions(+), 386 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 981bef89..dce568bd 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -1,3 +1,6 @@ +import inspect +from collections import defaultdict + import torch import torch.nn.functional as F @@ -19,6 +22,54 @@ def __init__(self): def get_loss(self, *args, **kwargs): raise NotImplementedError + def _init_param_map(self, key_map=None, **kwargs): + """Check the validity of key_map and other param map. 
Add these into self.param_map + + :param key_map: dict + :param kwargs: + :return: None + """ + value_counter = defaultdict(set) + if key_map is not None: + if not isinstance(key_map, dict): + raise TypeError("key_map must be `dict`, got {}.".format(type(key_map))) + for key, value in key_map.items(): + if value is None: + self.param_map[key] = key + continue + if not isinstance(key, str): + raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.") + if not isinstance(value, str): + raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.") + self.param_map[key] = value + value_counter[value].add(key) + for key, value in kwargs.items(): + if value is None: + self.param_map[key] = key + continue + if not isinstance(value, str): + raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.") + self.param_map[key] = value + value_counter[value].add(key) + for value, key_set in value_counter.items(): + if len(key_set) > 1: + raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.") + + # check consistence between signature and param_map + func_spect = inspect.getfullargspec(self.get_loss) + func_args = [arg for arg in func_spect.args if arg != 'self'] + for func_param, input_param in self.param_map.items(): + if func_param not in func_args: + raise NameError( + f"Parameter `{func_param}` is not in {get_func_signature(self.get_loss)}. Please check the " + f"initialization parameters, or change the signature of" + f" {get_func_signature(self.get_loss)}.") + + # evaluate should not have varargs. + if func_spect.varargs: + raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.get_loss)}(Do not use " + f"positional argument.).") + def __call__(self, output_dict, target_dict, force_check=False): """ :param output_dict: A dict from forward function of the network. 
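A usage sketch of the key mapping this method validates, in the spirit of the LossFunc examples in test_loss.py and assuming __call__ resolves the mapping the same way; the dict keys "pred" and "label" are made up, and CrossEntropyLoss is the class added further down in this patch:

import torch
from fastNLP.core.losses import CrossEntropyLoss

# map F.cross_entropy's `input`/`target` arguments onto the dict keys produced by the model and dataset
loss_fn = CrossEntropyLoss(input="pred", target="label")

pred = torch.randn(5, 3)                   # [batch, n_classes] logits
label = torch.LongTensor([1, 0, 1, 2, 1])  # [batch]
loss = loss_fn({"pred": pred}, {"label": label})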
@@ -106,6 +157,13 @@ def __init__(self, func, key_map=None, **kwargs): self.get_loss = func +class CrossEntropyLoss(LossBase): + def __init__(self, input=None, target=None): + super(CrossEntropyLoss, self).__init__() + self.get_loss = F.cross_entropy + self._init_param_map(input=input, target=target) + + class L1Loss(LossBase): def __init__(self): super(L1Loss, self).__init__() @@ -116,6 +174,7 @@ class BCELoss(LossBase): def __init__(self, input=None, target=None): super(BCELoss, self).__init__() self.get_loss = F.binary_cross_entropy + self._init_param_map(input=input, target=target) class NLLLoss(LossBase): @@ -287,11 +346,12 @@ def make_mask(lens, tar_len): class Loss(object): - '''a Loss object is a callable object represents loss functions - ''' + """a Loss object is a callable object represents loss functions + + """ def __init__(self, loss_name, pre_pro=[squash], **kwargs): - ''' + """ :param loss_name: str or None , the name of loss function :param pre_pro : list of function or str, methods to reform parameters before calculating loss @@ -303,7 +363,7 @@ def __init__(self, loss_name, pre_pro=[squash], **kwargs): kwargs is the extra parameters passed-in when calling loss function pre_pro functions should return two objects, respectively predict and truth that after processed - ''' + """ if loss_name is None: # this is useful when Trainer.__init__ performs type check diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f823cc52..bc688e9c 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -15,16 +15,15 @@ class MetricBase(object): def __init__(self): - self.param_map = {} # key is param in function, value is input param. + self.param_map = {} # key is param in function, value is input param. self._checked = False def evaluate(self, *args, **kwargs): raise NotImplementedError def _init_param_map(self, key_map=None, **kwargs): - """ + """Check the validity of key_map and other param map. Add these into self.param_map - check the validity of key_map and other param map. Add these into self.param_map :param key_map: dict :param kwargs: :return: None @@ -37,9 +36,9 @@ def _init_param_map(self, key_map=None, **kwargs): if value is None: self.param_map[key] = key continue - if isinstance(key, str): + if not isinstance(key, str): raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.") - if isinstance(value, str): + if not isinstance(value, str): raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.") self.param_map[key] = value value_counter[value].add(key) @@ -52,12 +51,12 @@ def _init_param_map(self, key_map=None, **kwargs): self.param_map[key] = value value_counter[value].add(key) for value, key_set in value_counter.items(): - if len(key_set)>1: + if len(key_set) > 1: raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.") # check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) - func_args = [arg for arg in func_spect.args if arg!='self'] + func_args = [arg for arg in func_spect.args if arg != 'self'] for func_param, input_param in self.param_map.items(): if func_param not in func_args: raise NameError(f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the " @@ -76,7 +75,7 @@ def __call__(self, pred_dict, target_dict, check=False): """ This method will call self.evaluate method. 
- Before calling self.evaluate, it will first check the validity ofoutput_dict, target_dict + Before calling self.evaluate, it will first check the validity of output_dict, target_dict (1) whether self.evaluate has varargs, which is not supported. (2) whether params needed by self.evaluate is not included in output_dict,target_dict. (3) whether params needed by self.evaluate duplicate in output_dict, target_dict diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 09274d2d..8a9f0cc1 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,5 +1,7 @@ import torch +from fastNLP.modules.decoder.MLP import MLP + class BaseModel(torch.nn.Module): """Base PyTorch model for all models. @@ -9,20 +11,20 @@ def __init__(self): super(BaseModel, self).__init__() def fit(self, train_data, dev_data=None, **train_args): - raise NotImplementedError + pass def predict(self, *args, **kwargs): raise NotImplementedError -class LinearClassifier(BaseModel): +class NaiveClassifier(BaseModel): def __init__(self, in_feature_dim, out_feature_dim): - super(LinearClassifier, self).__init__() - self.linear = torch.nn.Linear(in_feature_dim, out_feature_dim) - self.softmax = torch.nn.Softmax() + super(NaiveClassifier, self).__init__() + self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim, out_feature_dim]) + self.softmax = torch.nn.Softmax(dim=0) def forward(self, x): - return {"predict": self.softmax(self.linear(x))} + return {"predict": self.softmax(self.mlp(x))} def predict(self, x): - return {"predict": self.softmax(self.linear(x))} + return {"predict": self.softmax(self.mlp(x))} diff --git a/test/core/test_loss.py b/test/core/test_loss.py index edff342d..1124860b 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -1,370 +1,310 @@ import math import unittest +import torch import torch as tc import torch.nn.functional as F import fastNLP.core.losses as loss +from fastNLP.core.losses import LossFunc class TestLoss(unittest.TestCase): - def test_case_1(self): - #验证nllloss的原理 - - print (".----------------------------------") - - # loss_func = loss.Loss("nll") - print(callable(tc.nn.NLLLoss)) - - loss_func = loss.LossFunc(F.nll_loss) - - nll_loss = loss.NLLLoss() - - #pdb.set_trace() - - y = tc.Tensor( - [ - [.3,.4,.3], - [.5,.3,.2], - [.3,.6,.1], - ] - ) - - gy = tc.LongTensor( - [ - 0, - 1, - 2, - ] - ) - - - y = tc.log(y) - los = loss_func({'input': y}, {'target': gy}) - losses = nll_loss({'input': y}, {'target': gy}) - - r = -math.log(.3) - math.log(.3) - math.log(.1) - r /= 3 - print ("loss = %f" % (los)) - print ("r = %f" % (r)) - print ("nll_loss = %f" % (losses)) - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def _test_case_2(self): - #验证squash()的正确性 - print ("----------------------------------") - - log = math.log - - loss_func = loss.Loss("nll") - - #pdb.set_trace() - - y = tc.Tensor( - [ - [[.3,.4,.3],[.3,.4,.3],], - [[.5,.3,.2],[.1,.2,.7],], - [[.3,.6,.1],[.2,.1,.7],], - ] - ) - - gy = tc.LongTensor( - [ - [0,2], - [1,2], - [2,1], - ] - ) - - - #pdb.set_trace() - - y = tc.log(y) - #los = loss_func({'input': y}, {'target': gy}) - los = loss_func(y, gy) - print ("loss = %f" % (los)) - - r = -log(.3) - log(.3) - log(.1) - log(.3) - log(.7) - log(.1) - r /= 6 - print ("r = %f" % (r)) - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_3(self): - #验证pack_padded_sequence()的正确性 - print ("----------------------------------") - - log = math.log - - #loss_func = loss.Loss("nll") - loss_func = loss.NLLLoss() - - 
#pdb.set_trace() - - y = tc.Tensor( - [ - [[.3,.4,.3],[.3,.2,.5],[.4,.5,.1,],], - [[.5,.3,.2],[.1,.2,.7],[.0,.0,.0,],], - [[.3,.6,.1],[.0,.0,.0],[.0,.0,.0,],], - ] - ) - - gy = tc.LongTensor( - [ - [0,2,1,], - [1,2,0,], - [2,0,0,], - ] - ) - - lens = [3,2,1] - - #pdb.set_trace() - - y = tc.log(y) - - yy = tc.nn.utils.rnn.pack_padded_sequence(y , lens , batch_first = True).data - gyy = tc.nn.utils.rnn.pack_padded_sequence(gy , lens , batch_first = True).data - los = loss_func({'input': yy}, {'target': gyy}) - print ("loss = %f" % (los)) - - - r = -log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) - r /= 6 - print ("r = %f" % (r)) - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_4(self): - #验证unpad()的正确性 - print ("----------------------------------") - - log = math.log - - #pdb.set_trace() - - y = tc.Tensor( - [ - [[.3,.4,.3],[.3,.2,.5],[.4,.5,.1,],[.6,.3,.1,],], - [[.5,.3,.2],[.1,.2,.7],[.0,.0,.0,],[.0,.0,.0,],], - [[.3,.6,.1],[.0,.0,.0],[.0,.0,.0,],[.0,.0,.0,],], - ] - ) - - gy = tc.LongTensor( - [ - [0,2,1,2,], - [1,2,0,0,], - [2,0,0,0,], - ] - ) - - lens = [4,2,1] - - #pdb.set_trace() - - y = tc.log(y) - - loss_func = loss.Loss("nll" , pre_pro = ["unpad"]) - los = loss_func(y , gy , lens = lens) - print ("loss = %f" % (los)) - - - r = -log(.1) -log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) - r /= 7 - print ("r = %f" % (r)) - - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_5(self): - #验证mask()和make_mask()的正确性 - print ("----------------------------------") - - log = math.log - - #pdb.set_trace() - - y = tc.Tensor( - [ - [[.5,.3,.2],[.1,.2,.7],[.0,.0,.0,],[.0,.0,.0,],], - [[.5,.4,.1],[.3,.2,.5],[.4,.5,.1,],[.6,.1,.3,],], - [[.3,.6,.1],[.3,.2,.5],[.0,.0,.0,],[.0,.0,.0,],], - ] - ) - - gy = tc.LongTensor( - [ - [1,2,0,0,], - [0,2,1,2,], - [2,1,0,0,], - ] - ) - - mask = tc.ByteTensor( - [ - [1,1,0,0,], - [1,1,1,1,], - [1,1,0,0,], - ] - ) - - y = tc.log(y) - - lens = [2,4,2] - - loss_func = loss.Loss("nll" , pre_pro = ["mask"]) - los = loss_func(y , gy , mask = mask) - print ("loss = %f" % (los)) - - los2 = loss_func(y , gy , mask = loss.make_mask(lens,gy.size()[-1])) - print ("loss2 = %f" % (los2)) - - - r = -log(.3) -log(.7) - log(.5) - log(.5) - log(.5) - log(.3) - log(.1) - log(.2) - r /= 8 - print ("r = %f" % (r)) - - - self.assertEqual(int(los * 1000), int(r * 1000)) - self.assertEqual(int(los2 * 1000), int(r * 1000)) - - def test_case_6(self): - #验证unpad_mask()的正确性 - print ("----------------------------------") - - log = math.log - - #pdb.set_trace() - - y = tc.Tensor( - [ - [[.3,.4,.3],[.3,.2,.5],[.4,.5,.1,],[.6,.3,.1,],], - [[.5,.3,.2],[.1,.2,.7],[.0,.0,.0,],[.0,.0,.0,],], - [[.3,.6,.1],[.0,.0,.0],[.0,.0,.0,],[.0,.0,.0,],], - ] - ) - - gy = tc.LongTensor( - [ - [0,2,1,2,], - [1,2,0,0,], - [2,0,0,0,], - ] - ) - - lens = [4,2,1] - - #pdb.set_trace() - - y = tc.log(y) - - loss_func = loss.Loss("nll" , pre_pro = ["unpad_mask"]) - los = loss_func(y , gy , lens = lens) - print ("loss = %f" % (los)) - - - r = -log(.1) -log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) - r /= 7 - print ("r = %f" % (r)) - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_7(self): - #验证一些其他东西 - print ("----------------------------------") - - log = math.log - - #pdb.set_trace() - - y = tc.Tensor( - [ - [[.3,.4,.3],[.3,.2,.5],[.4,.5,.1,],[.6,.3,.1,],], - [[.5,.3,.2],[.1,.2,.7],[.0,.0,.0,],[.0,.0,.0,],], - [[.3,.6,.1],[.0,.0,.0],[.0,.0,.0,],[.0,.0,.0,],], - ] - ) - - gy = tc.LongTensor( - [ - [0,2,1,2,], - [1,2,0,0,], - 
[2,0,0,0,], - ] - ) - - lens = [4,2,1] - - #pdb.set_trace() - - y = tc.log(y) - - loss_func = loss.Loss("nll" , pre_pro = [] , weight = tc.Tensor([1,1,0])) - loss_func.add_pre_pro("unpad_mask") - los = loss_func(y , gy , lens = lens) - print ("loss = %f" % (los)) - - - r = - log(.3) - log(.5) - log(.3) - r /= 3 - print ("r = %f" % (r)) - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_8(self): - def func(a, b): - import torch.nn.functional as F - return F.cross_entropy(a, b) - - def func2(a, truth): - return func(a, truth) - - def func3(predict, truth): - return func(predict, truth) - - def func4(a, b, c=2): - return (a + b) * c - - def func6(a, b, **kwargs): - c = kwargs['c'] - return (a + b) * c - - -from fastNLP.core.losses import LossFunc - -get_loss = LossFunc(func, {'a': 'predict', 'b': 'truth'}) - predict = torch.randn(5, 3) - truth = torch.LongTensor([1, 0, 1, 2, 1]) - loss1 = get_loss({'predict': predict}, {'truth': truth}) -get_loss_2 = LossFunc(func2, {'a': 'predict'}) - loss2 = get_loss_2({'predict': predict}, {'truth': truth}) -get_loss_3 = LossFunc(func3) - loss3 = get_loss_3({'predict': predict}, {'truth': truth}) - print(loss1, loss2, loss3) - assert loss1 == loss2 and loss1 == loss3 - -get_loss_4 = LossFunc(func4) - loss4 = get_loss_4({'a': 1, 'b': 3}, {}) - print(loss4) - assert loss4 == (1 + 3) * 2 - -get_loss_5 = LossFunc(func4) - loss5 = get_loss_5({'a': 1, 'b': 3}, {'c': 4}) - print(loss5) - assert loss5 == (1 + 3) * 4 - -get_loss_6 = LossFunc(func6) - loss6 = get_loss_6({'a': 1, 'b': 3}, {'c': 4}) - print(loss6) - assert loss6 == (1 + 3) * 4 - -get_loss_7 = LossFunc(func6, c='cc') - loss7 = get_loss_7({'a': 1, 'b': 3}, {'cc': 4}) - print(loss7) - assert loss7 == (1 + 3) * 4 - - -if __name__ == "__main__": - unittest.main() + def test_case_1(self): + loss_func = loss.LossFunc(F.nll_loss) + nll_loss = loss.NLLLoss() + y = tc.Tensor( + [ + [.3, .4, .3], + [.5, .3, .2], + [.3, .6, .1], + ] + ) + + gy = tc.LongTensor( + [ + 0, + 1, + 2, + ] + ) + + y = tc.log(y) + los = loss_func({'input': y}, {'target': gy}) + losses = nll_loss({'input': y}, {'target': gy}) + + r = -math.log(.3) - math.log(.3) - math.log(.1) + r /= 3 + print("loss = %f" % (los)) + print("r = %f" % (r)) + print("nll_loss = %f" % (losses)) + + self.assertEqual(int(los * 1000), int(r * 1000)) + + def test_case_2(self): + # 验证squash()的正确性 + + log = math.log + loss_func = loss.Loss("nll") + + y = tc.Tensor( + [ + [[.3, .4, .3], [.3, .4, .3], ], + [[.5, .3, .2], [.1, .2, .7], ], + [[.3, .6, .1], [.2, .1, .7], ], + ] + ) + + gy = tc.LongTensor( + [ + [0, 2], + [1, 2], + [2, 1], + ] + ) + + y = tc.log(y) + # los = loss_func({'input': y}, {'target': gy}) + los = loss_func(y, gy) + + r = -log(.3) - log(.3) - log(.1) - log(.3) - log(.7) - log(.1) + r /= 6 + + self.assertEqual(int(los * 1000), int(r * 1000)) + + def test_case_3(self): + # 验证pack_padded_sequence()的正确性 + log = math.log + loss_func = loss.NLLLoss() + y = tc.Tensor( + [ + [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], ], + [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], ], + [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], ], + ] + ) + + gy = tc.LongTensor( + [ + [0, 2, 1, ], + [1, 2, 0, ], + [2, 0, 0, ], + ] + ) + + lens = [3, 2, 1] + + # pdb.set_trace() + + y = tc.log(y) + + yy = tc.nn.utils.rnn.pack_padded_sequence(y, lens, batch_first=True).data + gyy = tc.nn.utils.rnn.pack_padded_sequence(gy, lens, batch_first=True).data + los = loss_func({'input': yy}, {'target': gyy}) + + r = -log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) + 
r /= 6 + + self.assertEqual(int(los * 1000), int(r * 1000)) + + def test_case_4(self): + # 验证unpad()的正确性 + log = math.log + y = tc.Tensor( + [ + [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], [.6, .3, .1, ], ], + [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], + [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], [.0, .0, .0, ], ], + ] + ) + + gy = tc.LongTensor( + [ + [0, 2, 1, 2, ], + [1, 2, 0, 0, ], + [2, 0, 0, 0, ], + ] + ) + + lens = [4, 2, 1] + y = tc.log(y) + + loss_func = loss.Loss("nll", pre_pro=["unpad"]) + los = loss_func(y, gy, lens=lens) + + r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) + r /= 7 + + self.assertEqual(int(los * 1000), int(r * 1000)) + + def test_case_5(self): + # 验证mask()和make_mask()的正确性 + log = math.log + + y = tc.Tensor( + [ + [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], + [[.5, .4, .1], [.3, .2, .5], [.4, .5, .1, ], [.6, .1, .3, ], ], + [[.3, .6, .1], [.3, .2, .5], [.0, .0, .0, ], [.0, .0, .0, ], ], + ] + ) + + gy = tc.LongTensor( + [ + [1, 2, 0, 0, ], + [0, 2, 1, 2, ], + [2, 1, 0, 0, ], + ] + ) + + mask = tc.ByteTensor( + [ + [1, 1, 0, 0, ], + [1, 1, 1, 1, ], + [1, 1, 0, 0, ], + ] + ) + + y = tc.log(y) + + lens = [2, 4, 2] + + loss_func = loss.Loss("nll", pre_pro=["mask"]) + los = loss_func(y, gy, mask=mask) + + los2 = loss_func(y, gy, mask=loss.make_mask(lens, gy.size()[-1])) + + r = -log(.3) - log(.7) - log(.5) - log(.5) - log(.5) - log(.3) - log(.1) - log(.2) + r /= 8 + + self.assertEqual(int(los * 1000), int(r * 1000)) + self.assertEqual(int(los2 * 1000), int(r * 1000)) + + def test_case_6(self): + # 验证unpad_mask()的正确性 + log = math.log + y = tc.Tensor( + [ + [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], [.6, .3, .1, ], ], + [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], + [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], [.0, .0, .0, ], ], + ] + ) + + gy = tc.LongTensor( + [ + [0, 2, 1, 2, ], + [1, 2, 0, 0, ], + [2, 0, 0, 0, ], + ] + ) + + lens = [4, 2, 1] + + # pdb.set_trace() + + y = tc.log(y) + + loss_func = loss.Loss("nll", pre_pro=["unpad_mask"]) + los = loss_func(y, gy, lens=lens) + + r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) + r /= 7 + + self.assertEqual(int(los * 1000), int(r * 1000)) + + def test_case_7(self): + # 验证一些其他东西 + log = math.log + y = tc.Tensor( + [ + [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], [.6, .3, .1, ], ], + [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], + [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], [.0, .0, .0, ], ], + ] + ) + + gy = tc.LongTensor( + [ + [0, 2, 1, 2, ], + [1, 2, 0, 0, ], + [2, 0, 0, 0, ], + ] + ) + + lens = [4, 2, 1] + y = tc.log(y) + + loss_func = loss.Loss("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0])) + loss_func.add_pre_pro("unpad_mask") + los = loss_func(y, gy, lens=lens) + + r = - log(.3) - log(.5) - log(.3) + r /= 3 + self.assertEqual(int(los * 1000), int(r * 1000)) + + def test_case_8(self): + def func(a, b): + return F.cross_entropy(a, b) + + def func2(a, truth): + return func(a, truth) + + def func3(predict, truth): + return func(predict, truth) + + def func4(a, b, c=2): + return (a + b) * c + + def func6(a, b, **kwargs): + c = kwargs['c'] + return (a + b) * c + + get_loss = LossFunc(func, {'a': 'predict', 'b': 'truth'}) + predict = torch.randn(5, 3) + truth = torch.LongTensor([1, 0, 1, 2, 1]) + loss1 = get_loss({'predict': predict}, {'truth': truth}) + get_loss_2 = LossFunc(func2, {'a': 'predict'}) + loss2 = get_loss_2({'predict': predict}, {'truth': truth}) + get_loss_3 = 
LossFunc(func3) + loss3 = get_loss_3({'predict': predict}, {'truth': truth}) + assert loss1 == loss2 and loss1 == loss3 + + """ + get_loss_4 = LossFunc(func4) + loss4 = get_loss_4({'a': 1, 'b': 3}, {}) + print(loss4) + assert loss4 == (1 + 3) * 2 + + get_loss_5 = LossFunc(func4) + loss5 = get_loss_5({'a': 1, 'b': 3}, {'c': 4}) + print(loss5) + assert loss5 == (1 + 3) * 4 + + get_loss_6 = LossFunc(func6) + loss6 = get_loss_6({'a': 1, 'b': 3}, {'c': 4}) + print(loss6) + assert loss6 == (1 + 3) * 4 + + get_loss_7 = LossFunc(func6, c='cc') + loss7 = get_loss_7({'a': 1, 'b': 3}, {'cc': 4}) + print(loss7) + assert loss7 == (1 + 3) * 4 + """ + + +class TestLoss_v2(unittest.TestCase): + def test_CrossEntropyLoss(self): + ce = loss.CrossEntropyLoss(input="my_predict", target="my_truth") + a = torch.randn(3, 5, requires_grad=False) + b = torch.empty(3, dtype=torch.long).random_(5) + ans = ce({"my_predict": a}, {"my_truth": b}) + self.assertEqual(ans, torch.nn.functional.cross_entropy(a, b)) + + def test_BCELoss(self): + bce = loss.BCELoss(input="my_predict", target="my_truth") + a = torch.sigmoid(torch.randn((3, 5), requires_grad=False)) + b = torch.randn((3, 5), requires_grad=False) + ans = bce({"my_predict": a}, {"my_truth": b}) + self.assertEqual(ans, torch.nn.functional.binary_cross_entropy(a, b)) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 0194d254..3b0e2b71 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,24 +1,23 @@ import unittest import numpy as np -import torch from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance -from fastNLP.core.losses import LossFunc +from fastNLP.core.losses import BCELoss from fastNLP.core.metrics import AccuracyMetric from fastNLP.core.optimizer import SGD from fastNLP.core.trainer import Trainer -from fastNLP.models.base_model import LinearClassifier +from fastNLP.models.base_model import NaiveClassifier class TrainerTestGround(unittest.TestCase): def test_case(self): - mean = np.array([-3, -3]) + mean = np.array([-5, -5]) cov = np.array([[1, 0], [0, 1]]) class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) - mean = np.array([3, 3]) + mean = np.array([5, 5]) cov = np.array([[1, 0], [0, 1]]) class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) @@ -30,11 +29,10 @@ def test_case(self): train_set, dev_set = data_set.split(0.3) - model = LinearClassifier(2, 1) + model = NaiveClassifier(2, 1) trainer = Trainer(train_set, model, - losser=LossFunc(torch.nn.functional.binary_cross_entropy, - key_map={"target": "y", "input": "predict"}), + losser=BCELoss(input="predict", target="y"), metrics=AccuracyMetric(pred="predict", target="y"), n_epochs=10, batch_size=32, From 11c82ab2e781d4ecdae8be29f97706b8c5eb4d43 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 2 Dec 2018 20:07:50 +0800 Subject: [PATCH 32/67] =?UTF-8?q?=E8=B7=91=E9=80=9Atest=5Ftrainer.py?= =?UTF-8?q?=EF=BC=8C=E8=81=94=E8=B0=83=E7=BB=93=E6=9D=9F=EF=BC=8C=E5=87=86?= =?UTF-8?q?=E5=A4=87=E5=8F=91=E5=B8=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/models/base_model.py | 7 +++---- test/core/test_trainer.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 8a9f0cc1..ec532014 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -20,11 +20,10 @@ def predict(self, *args, **kwargs): class NaiveClassifier(BaseModel): def __init__(self, 
in_feature_dim, out_feature_dim): super(NaiveClassifier, self).__init__() - self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim, out_feature_dim]) - self.softmax = torch.nn.Softmax(dim=0) + self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) def forward(self, x): - return {"predict": self.softmax(self.mlp(x))} + return {"predict": torch.sigmoid(self.mlp(x))} def predict(self, x): - return {"predict": self.softmax(self.mlp(x))} + return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 3b0e2b71..ee4a5770 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -13,11 +13,11 @@ class TrainerTestGround(unittest.TestCase): def test_case(self): - mean = np.array([-5, -5]) + mean = np.array([-3, -3]) cov = np.array([[1, 0], [0, 1]]) class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) - mean = np.array([5, 5]) + mean = np.array([3, 3]) cov = np.array([[1, 0], [0, 1]]) class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) @@ -39,7 +39,7 @@ def test_case(self): print_every=10, validate_every=-1, dev_data=dev_set, - optimizer=SGD(0.001), + optimizer=SGD(0.1), check_code_level=2 ) trainer.train() From d19850b397de5ce644d77c7deaf62e9c48e6b037 Mon Sep 17 00:00:00 2001 From: yh Date: Sun, 2 Dec 2018 23:27:40 +0800 Subject: [PATCH 33/67] * add _fast_call_evaluate mechanism in MetricBase --- fastNLP/core/metrics.py | 69 ++++++++++++++++++++++++++++++++------- test/core/test_metrics.py | 36 +++++++++++++------- 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f823cc52..6401d731 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -11,7 +11,7 @@ from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import seq_lens_to_masks - +from fastNLP.core.utils import CheckRes class MetricBase(object): def __init__(self): @@ -72,6 +72,17 @@ def _init_param_map(self, key_map=None, **kwargs): def get_metric(self, reset=True): raise NotImplemented + def _fast_call_evaluate(self, pred_dict, target_dict): + """ + + Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. + such as pred_dict has one element, target_dict has one element + :param pred_dict: + :param target_dict: + :return: boolean, whether to go on codes in self.__call__(). When False, don't go on. + """ + return False + def __call__(self, pred_dict, target_dict, check=False): """ @@ -79,7 +90,7 @@ def __call__(self, pred_dict, target_dict, check=False): Before calling self.evaluate, it will first check the validity ofoutput_dict, target_dict (1) whether self.evaluate has varargs, which is not supported. (2) whether params needed by self.evaluate is not included in output_dict,target_dict. - (3) whether params needed by self.evaluate duplicate in output_dict, target_dict + (3) whether params needed by self.evaluate duplicate in pred_dict, target_dict (4) whether params in output_dict, target_dict are not used by evaluate.(Might cause warning) Besides, before passing params into self.evaluate, this function will filter out params from output_dict and target_dict which are not used in self.evaluate. 
(but if **kwargs presented in self.evaluate, no filtering @@ -92,6 +103,10 @@ def __call__(self, pred_dict, target_dict, check=False): if not callable(self.evaluate): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") + if not check: + if self._fast_call_evaluate(pred_dict=pred_dict, target_dict=target_dict): + return + if not self._checked: # 1. check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) @@ -110,28 +125,40 @@ def __call__(self, pred_dict, target_dict, check=False): # need to wrap inputs in dict. mapped_pred_dict = {} mapped_target_dict = {} + duplicated = [] for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())): + not_duplicate_flag = 0 if input_arg in self._reverse_param_map: mapped_arg = self._reverse_param_map[input_arg] + not_duplicate_flag += 1 else: mapped_arg = input_arg if input_arg in pred_dict: mapped_pred_dict[mapped_arg] = pred_dict[input_arg] + not_duplicate_flag += 1 if input_arg in target_dict: mapped_target_dict[mapped_arg] = target_dict[input_arg] + not_duplicate_flag += 1 + if not_duplicate_flag == 3: + duplicated.append(input_arg) - # check duplicated, unused, missing + # missing if check or not self._checked: check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict]) - for key in check_res._fields: - value = getattr(check_res, key) - new_value = list(value) - # TODO 这里报错的逻辑应该是怎样的? - for idx, func_arg in enumerate(value): - if func_arg in self.param_map: - new_value[idx] = self.param_map[func_arg] + f'(try to get value from {self.param_map[func_arg]})' - else: - new_value[idx] = func_arg + # only check missing. + missing = check_res.missing + replaced_missing = list(missing) + for idx, func_arg in enumerate(missing): + replaced_missing[idx] = f"`{self.param_map[func_arg]}`" + f"(assign to `{func_arg}` " \ + f"in `{get_func_signature(self.evaluate)}`)" + + check_res = CheckRes(missing=replaced_missing, + unused=check_res.unused, + duplicated=duplicated, + required=check_res.required, + all_needed=check_res.all_needed, + varargs=check_res.varargs) + if check_res.missing or check_res.duplicated or check_res.varargs: raise CheckError(check_res=check_res, func_signature=get_func_signature(self.evaluate)) @@ -140,6 +167,7 @@ def __call__(self, pred_dict, target_dict, check=False): self.evaluate(**refined_args) self._checked = True + return class AccuracyMetric(MetricBase): def __init__(self, pred=None, target=None, masks=None, seq_lens=None): @@ -151,6 +179,22 @@ def __init__(self, pred=None, target=None, masks=None, seq_lens=None): self.total = 0 self.acc_count = 0 + def _fast_call_evaluate(self, pred_dict, target_dict): + """ + + Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. + such as pred_dict has one element, target_dict has one element + :param pred_dict: + :param target_dict: + :return: boolean, whether to go on codes in self.__call__(). When False, don't go on. + """ + if len(pred_dict)==1 and len(target_dict)==1: + pred = list(pred_dict.values())[0] + target = list(target_dict.values())[0] + self.evaluate(pred=pred, target=target) + return True + return False + def evaluate(self, pred, target, masks=None, seq_lens=None): """ @@ -164,6 +208,7 @@ def evaluate(self, pred, target, masks=None, seq_lens=None): None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided. 
:return: dict({'acc': float}) """ + #TODO 这里报错需要更改,因为pred是啥用户并不知道。需要告知用户真实的value if not isinstance(pred, torch.Tensor): raise TypeError(f"`pred` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(pred)}.") diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index c6a8523e..ffc11401 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -12,7 +12,7 @@ class TestAccuracyMetric(unittest.TestCase): # target_dict = {'target': torch.zeros(4)} # metric = AccuracyMetric() # - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # print(metric.get_metric()) # # def test_AccuracyMetric2(self): @@ -22,7 +22,7 @@ class TestAccuracyMetric(unittest.TestCase): # target_dict = {'target': torch.zeros(4)} # metric = AccuracyMetric() # - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # print(metric.get_metric()) # except Exception as e: # print(e) @@ -35,11 +35,11 @@ class TestAccuracyMetric(unittest.TestCase): # metric = AccuracyMetric() # pred_dict = {"pred": torch.zeros(4, 3, 2)} # target_dict = {'target': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # # pred_dict = {"pred": torch.zeros(4, 3, 2)} # target_dict = {'target': torch.zeros(4)} - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # # print(metric.get_metric()) # except Exception as e: @@ -76,7 +76,7 @@ class TestAccuracyMetric(unittest.TestCase): # # pred_dict = {"pred": torch.zeros(4, 3, 2)} # target_dict = {'target': torch.zeros(4, 3)+1} - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # self.assertDictEqual(metric.get_metric(), {'acc':0}) # # def test_AccuaryMetric6(self): @@ -85,7 +85,7 @@ class TestAccuracyMetric(unittest.TestCase): # metric = AccuracyMetric() # pred_dict = {"pred": np.zeros((4, 3, 2))} # target_dict = {'target': np.zeros((4, 3))} - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # self.assertDictEqual(metric.get_metric(), {'acc': 1}) # except Exception as e: # print(e) @@ -97,7 +97,7 @@ class TestAccuracyMetric(unittest.TestCase): # metric = AccuracyMetric(pred='predictions', target='targets') # pred_dict = {"predictions": torch.zeros(4, 3, 2)} # target_dict = {'targets': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict) + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) # self.assertDictEqual(metric.get_metric(), {'acc': 1}) # # def test_AccuaryMetric8(self): @@ -106,6 +106,19 @@ class TestAccuracyMetric(unittest.TestCase): # metric = AccuracyMetric(pred='predictions', target='targets') # pred_dict = {"prediction": torch.zeros(4, 3, 2)} # target_dict = {'targets': torch.zeros(4, 3)} + # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) + # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + # except Exception as e: + # print(e) + # return + # self.assertTrue(True, False), "No exception catches." 
+ + # def test_AccuaryMetric9(self): + # # (9) check map, include unused + # try: + # metric = AccuracyMetric(pred='predictions', target='targets') + # pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused':1} + # target_dict = {'targets': torch.zeros(4, 3)} # metric(pred_dict=pred_dict, target_dict=target_dict) # self.assertDictEqual(metric.get_metric(), {'acc': 1}) # except Exception as e: @@ -113,11 +126,11 @@ class TestAccuracyMetric(unittest.TestCase): # return # self.assertTrue(True, False), "No exception catches." - def test_AccuaryMetric9(self): - # (9) check map, include unused + def test_AccuaryMetric10(self): + # (10) check _fast_metric try: - metric = AccuracyMetric(pred='predictions', target='targets') - pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused':1} + metric = AccuracyMetric() + pred_dict = {"predictions": torch.zeros(4, 3, 2)} target_dict = {'targets': torch.zeros(4, 3)} metric(pred_dict=pred_dict, target_dict=target_dict) self.assertDictEqual(metric.get_metric(), {'acc': 1}) @@ -125,4 +138,3 @@ def test_AccuaryMetric9(self): print(e) return self.assertTrue(True, False), "No exception catches." - From 5824b7f4c73788738baa0d39c01ec0d12bc4ba0e Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Mon, 3 Dec 2018 00:08:59 +0800 Subject: [PATCH 34/67] =?UTF-8?q?=E8=B7=91=E9=80=9Atutorial,=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=E4=B8=80=E4=BA=9Bbugs:=20*=20dataset=E6=A3=80?= =?UTF-8?q?=E6=9F=A5slice=E5=BC=80=E5=A7=8B=E4=BD=8D=E7=BD=AE=EF=BC=8C?= =?UTF-8?q?=E7=A1=AE=E4=BF=9D=E7=BB=93=E6=9E=9C=E4=B8=8D=E4=B8=BA=E7=A9=BA?= =?UTF-8?q?=20*=20fieldarray=E6=A3=80=E6=9F=A5content=E4=B8=8D=E4=B8=BA?= =?UTF-8?q?=E7=A9=BA=20*=20optimizer=E6=8E=A5=E5=8F=97=E7=9A=84model=20par?= =?UTF-8?q?ams=E6=98=AF=E4=B8=80=E4=B8=AAgenerator=EF=BC=8C=E4=B8=8D?= =?UTF-8?q?=E8=83=BD=E8=B5=8B=E5=80=BC=20*=20code=20style=20refine?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 7 +- fastNLP/core/fieldarray.py | 3 + fastNLP/core/optimizer.py | 12 ++- fastNLP/models/cnn_text_classification.py | 7 +- test/io/__init__.py | 0 test/test_tutorial.py | 95 +++++++++++++++++++++++ 6 files changed, 115 insertions(+), 9 deletions(-) delete mode 100644 test/io/__init__.py create mode 100644 test/test_tutorial.py diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 6d2a94d6..e93333a0 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -87,6 +87,8 @@ def __getitem__(self, idx): if isinstance(idx, int): return Instance(**{name: self.field_arrays[name][idx] for name in self.field_arrays}) elif isinstance(idx, slice): + if idx.start is not None and (idx.start >= len(self) or idx.start <= -len(self)): + raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}") data_set = DataSet() for field in self.field_arrays.values(): data_set.add_field(name=field.name, @@ -135,7 +137,9 @@ def add_field(self, name, fields, padding_val=0, is_input=False, is_target=False :param bool is_target: whether this field is label or target. """ if len(self.field_arrays) != 0: - assert len(self) == len(fields) + if len(self) != len(fields): + raise RuntimeError(f"The field to append must have the same size as dataset. 
" + f"Dataset size {len(self)} != field size {len(fields)}") self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target, is_input=is_input) @@ -168,6 +172,7 @@ def rename_field(self, old_name, new_name): """ if old_name in self.field_arrays: self.field_arrays[new_name] = self.field_arrays.pop(old_name) + self.field_arrays[new_name].name = new_name else: raise KeyError("{} is not a valid name. ".format(old_name)) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 976dc2c6..14c52829 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -33,7 +33,10 @@ def _type_detection(content): type_set = set([type(item) for item in content[0]]) else: # 1-D list + if len(content) == 0: + raise RuntimeError("Cannot create FieldArray with an empty list.") type_set = set([type(item) for item in content]) + if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)): return type_set.pop() elif len(type_set) == 2 and float in type_set and int in type_set: diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 4cb21462..5075fa02 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -42,8 +42,10 @@ def __init__(self, *args, **kwargs): def construct_from_pytorch(self, model_params): if self.model_params is None: - self.model_params = model_params - return torch.optim.SGD(self.model_params, **self.settings) + # careful! generator cannot be assigned. + return torch.optim.SGD(model_params, **self.settings) + else: + return torch.optim.SGD(self.model_params, **self.settings) class Adam(Optimizer): @@ -75,5 +77,7 @@ def __init__(self, *args, **kwargs): def construct_from_pytorch(self, model_params): if self.model_params is None: - self.model_params = model_params - return torch.optim.Adam(self.model_params, **self.settings) + # careful! generator cannot be assigned. 
+ return torch.optim.Adam(model_params, **self.settings) + else: + return torch.optim.Adam(self.model_params, **self.settings) diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 04b76fba..9aa07e66 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -18,8 +18,8 @@ class CNNText(torch.nn.Module): def __init__(self, embed_num, embed_dim, num_classes, - kernel_nums=(3,4,5), - kernel_sizes=(3,4,5), + kernel_nums=(3, 4, 5), + kernel_sizes=(3, 4, 5), padding=0, dropout=0.5): super(CNNText, self).__init__() @@ -45,7 +45,7 @@ def forward(self, word_seq): x = self.conv_pool(x) # [N,L,C] -> [N,C] x = self.dropout(x) x = self.fc(x) # [N,C] -> [N, N_class] - return {'output':x} + return {'output': x} def predict(self, word_seq): """ @@ -78,4 +78,3 @@ def evaluate(self, predict, label_seq): correct = (predict == label_seq).long().sum().item() total = label_seq.size(0) return {'acc': 1.0 * correct / total} - diff --git a/test/io/__init__.py b/test/io/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/test_tutorial.py b/test/test_tutorial.py new file mode 100644 index 00000000..05338514 --- /dev/null +++ b/test/test_tutorial.py @@ -0,0 +1,95 @@ +import unittest + +from fastNLP import DataSet +from fastNLP import Instance +from fastNLP import Tester +from fastNLP import Vocabulary +from fastNLP.core.losses import CrossEntropyLoss +from fastNLP.core.metrics import AccuracyMetric +from fastNLP.models import CNNText + + +class TestTutorial(unittest.TestCase): + def test_tutorial(self): + # 从csv读取数据到DataSet + dataset = DataSet.read_csv("./data_for_tests/tutorial_sample_dataset.csv", headers=('raw_sentence', 'label'), + sep='\t') + print(len(dataset)) + print(dataset[0]) + + dataset.append(Instance(raw_sentence='fake data', label='0')) + dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence') + # label转int + dataset.apply(lambda x: int(x['label']), new_field_name='label') + + # 使用空格分割句子 + def split_sent(ins): + return ins['raw_sentence'].split() + + dataset.apply(split_sent, new_field_name='words') + # 增加长度信息 + dataset.apply(lambda x: len(x['words']), new_field_name='seq_len') + print(len(dataset)) + print(dataset[0]) + + # DataSet.drop(func)筛除数据 + dataset.drop(lambda x: x['seq_len'] <= 3) + print(len(dataset)) + + # 设置DataSet中,哪些field要转为tensor + # set target,loss或evaluate中的golden,计算loss,模型评估时使用 + dataset.set_target("label") + # set input,模型forward时使用 + dataset.set_input("words") + + # 分出测试集、训练集 + test_data, train_data = dataset.split(0.5) + print(len(test_data)) + print(len(train_data)) + + # 构建词表, Vocabulary.add(word) + vocab = Vocabulary(min_freq=2) + train_data.apply(lambda x: [vocab.add(word) for word in x['words']]) + vocab.build_vocab() + + # index句子, Vocabulary.to_index(word) + train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words') + test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words') + print(test_data[0]) + + model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1) + + from fastNLP import Trainer + from copy import deepcopy + + # 更改DataSet中对应field的名称,要以模型的forward等参数名一致 + train_data.rename_field('words', 'word_seq') # input field 与 forward 参数一致 + train_data.rename_field('label', 'label_seq') + test_data.rename_field('words', 'word_seq') + test_data.rename_field('label', 'label_seq') + + # 实例化Trainer,传入模型和数据,进行训练 + 
copy_model = deepcopy(model) + overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data, + losser=CrossEntropyLoss(input="output", target="label_seq"), + metrics=AccuracyMetric(pred="predict", target="label_seq"), + save_path="./save", + batch_size=4, + n_epochs=10) + overfit_trainer.train() + + trainer = Trainer(model=model, train_data=train_data, dev_data=test_data, + losser=CrossEntropyLoss(input="output", target="label_seq"), + metrics=AccuracyMetric(pred="predict", target="label_seq"), + save_path="./save", + batch_size=4, + n_epochs=10) + trainer.train() + print('Train finished!') + + # 使用fastNLP的Tester测试脚本 + + tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"), + batch_size=4) + acc = tester.test() + print(acc) From 88949ba1da4f24e339eb2ac8df9d20e4153b1443 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 3 Dec 2018 09:50:16 +0800 Subject: [PATCH 35/67] =?UTF-8?q?=E4=BF=AE=E6=94=B9dataset.py=E7=9A=84appl?= =?UTF-8?q?y=20signature;=20batch=E5=BD=93=E4=B8=AD=E5=A2=9E=E5=8A=A0num?= =?UTF-8?q?=5Fbatches=E5=B1=9E=E6=80=A7;=20tester=E7=9A=84format=5Feval=5F?= =?UTF-8?q?results=E4=BF=AE=E6=94=B9;=20metric=E5=A2=9E=E5=8A=A0fast=5Feva?= =?UTF-8?q?luate=5Fcall=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 4 ++++ fastNLP/core/dataset.py | 7 ++++--- fastNLP/core/metrics.py | 2 +- fastNLP/core/tester.py | 12 ++++++------ fastNLP/core/trainer.py | 8 ++++---- fastNLP/core/utils.py | 4 ++-- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 0aca6055..2e77e3f7 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -26,6 +26,7 @@ def __init__(self, dataset, batch_size, sampler, as_numpy=False): self.as_numpy = as_numpy self.idx_list = None self.curidx = 0 + self.num_batches = len(dataset)//batch_size + int(len(dataset)%batch_size!=0) def __iter__(self): self.idx_list = self.sampler(self.dataset) @@ -56,6 +57,9 @@ def __next__(self): return batch_x, batch_y + def __len__(self): + return self.num_batches + def to_tensor(batch, dtype): if dtype in (np.int8, np.int16, np.int32, np.int64): batch = torch.LongTensor(batch) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 6d2a94d6..2a7109a3 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -168,6 +168,7 @@ def rename_field(self, old_name, new_name): """ if old_name in self.field_arrays: self.field_arrays[new_name] = self.field_arrays.pop(old_name) + self.field_arrays[new_name].name = new_name else: raise KeyError("{} is not a valid name. ".format(old_name)) @@ -213,12 +214,12 @@ def wrapper(read_cls): return wrapper - def apply(self, func, new_field_name=None): + def apply(self, func, new_field_name=None, is_input=False, is_target=False): """Apply a function to every instance of the DataSet. :param func: a function that takes an instance as input. :param str new_field_name: If not None, results of the function will be stored as a new field. - :return results: returned values of the function over all instances. + :return results: if new_field_name is not passed, returned values of the function over all instances. 
""" results = [func(ins) for ins in self] if new_field_name is not None: @@ -231,7 +232,7 @@ def apply(self, func, new_field_name=None): is_input=old_field.is_input, is_target=old_field.is_target) else: - self.add_field(name=new_field_name, fields=results) + self.add_field(name=new_field_name, fields=results, is_input=is_input, is_target=is_target) else: return results diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 8ec2f7af..070b1d17 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -245,7 +245,7 @@ def evaluate(self, pred, target, masks=None, seq_lens=None): self.total += np.prod(list(pred.size())) def get_metric(self, reset=True): - evaluate_result = {'acc': self.acc_count/self.total} + evaluate_result = {'acc': round(self.acc_count/self.total, 6)} if reset: self.acc_count = 0 self.total = 0 diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 0c3bcefb..0e30ab9b 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -17,7 +17,7 @@ class Tester(object): """An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ - def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose=0): + def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose=1): super(Tester, self).__init__() if not isinstance(data, DataSet): @@ -76,7 +76,7 @@ def test(self): _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, check_res=e.check_res, output=output, batch_y=truths, check_level=0) - if self.verbose >= 0: + if self.verbose >= 1: print("[tester] \n{}".format(self._format_eval_results(eval_results))) self._mode(network, is_test=False) return eval_results @@ -107,7 +107,7 @@ def _format_eval_results(self, results): """ _str = '' for metric_name, metric_result in results.items(): - _str += metric_name + '\n\t' - _str += ", ".join([str(key) + "=" + str(value) for key, value in results.items()]) - _str += '\n' - return _str + _str += metric_name + ': ' + _str += ", ".join([str(key) + "=" + str(value) for key, value in metric_result.items()]) + _str += '\n' + return _str[:-1] diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 2cf18b90..20d54073 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -28,9 +28,9 @@ class Trainer(object): """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=-1, + def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, validate_every=-1, - dev_data=None, use_cuda=False, save_path="./save", + dev_data=None, use_cuda=False, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, metric_key=None, **kwargs): @@ -307,8 +307,8 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ _move_dict_value_to_device(batch_x, batch_y, device=model_devcie) # forward check if batch_count==0: - _check_forward_error(forward_func=model.forward, check_level=check_level, - batch_x=batch_x) + _check_forward_error(forward_func=model.forward, dataset=dataset, + batch_x=batch_x, check_level=check_level) refined_batch_x = _build_args(model.forward, **batch_x) output = model(**refined_batch_x) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index c9cd7c03..95297a54 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -207,7 +207,7 @@ class CheckError(Exception): CheckError. 
Used in losses.LossBase, metrics.MetricBase. """ def __init__(self, check_res:CheckRes, func_signature:str): - errs = [f'The following problems occurred when calling {func_signature}'] + errs = [f'The following problems occurred when calling `{func_signature}`'] if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") @@ -255,7 +255,7 @@ def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res: warnings.warn(message=_unused_warn) -def _check_forward_error(forward_func, batch_x, check_level): +def _check_forward_error(forward_func, batch_x, dataset, check_level): check_res = _check_arg_dict_list(forward_func, batch_x) func_signature = get_func_signature(forward_func) From cc440b5ed6596c6a677e7debc8e820431a923f75 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Mon, 3 Dec 2018 11:12:56 +0800 Subject: [PATCH 36/67] =?UTF-8?q?All=20tests=20pass.=20*=20=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E6=B5=8B=E8=AF=95=E4=BB=A3=E7=A0=81=EF=BC=8C=E8=B7=91?= =?UTF-8?q?=E9=80=9A=E6=89=80=E6=9C=89=E6=B5=8B=E8=AF=95=EF=BC=8C=E8=A6=86?= =?UTF-8?q?=E7=9B=96=E7=8E=8765%=20*=20refine=E4=BB=A3=E7=A0=81=E8=A7=84?= =?UTF-8?q?=E8=8C=83=E5=92=8C=E6=9F=90=E4=BA=9B=E6=B3=A8=E9=87=8A=20*=20fi?= =?UTF-8?q?x=20tester=20self.use=5Fcuda=E6=9C=AA=E8=B5=8B=E5=80=BC?= =?UTF-8?q?=E5=85=88=E4=BD=BF=E7=94=A8=E7=9A=84bug=20*=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0tutorial=E6=A0=B7=E4=BE=8B=E6=95=B0=E6=8D=AE=E2=80=94?= =?UTF-8?q?=E2=80=94tutorial=5Fsample=5Fdataset.csv=20*=20=E3=80=90unsolve?= =?UTF-8?q?d=E3=80=91embed=5Floader=E5=9C=A8=E8=AE=A1=E7=AE=97np.cov?= =?UTF-8?q?=E6=97=B6=E9=81=87=E5=88=B0segmentation=20fault?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 5 ++- fastNLP/core/dataset.py | 13 +++++-- fastNLP/core/tester.py | 9 +++-- test/core/test_batch.py | 4 +- test/data_for_tests/glove.6B.50d_test.txt | 6 +-- .../tutorial_sample_dataset.csv | 38 +++++++++++++++++++ test/io/test_embed_loader.py | 6 +-- test/test_tutorial.py | 4 +- 8 files changed, 64 insertions(+), 21 deletions(-) create mode 100644 test/data_for_tests/tutorial_sample_dataset.csv diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 2e77e3f7..a4d7a8ae 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -1,5 +1,5 @@ -import torch import numpy as np +import torch class Batch(object): @@ -60,9 +60,10 @@ def __next__(self): def __len__(self): return self.num_batches + def to_tensor(batch, dtype): if dtype in (np.int8, np.int16, np.int32, np.int64): batch = torch.LongTensor(batch) if dtype in (np.float32, np.float64): batch = torch.FloatTensor(batch) - return batch \ No newline at end of file + return batch diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 3269cef3..749d3e74 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -174,7 +174,7 @@ def rename_field(self, old_name, new_name): self.field_arrays[new_name] = self.field_arrays.pop(old_name) self.field_arrays[new_name].name = new_name else: - raise KeyError("{} is not a valid name. ".format(old_name)) + raise KeyError("DataSet has no field named {}.".format(old_name)) def set_target(self, *field_names, flag=True): """Change the target flag of these fields. 
@@ -208,8 +208,6 @@ def get_target_name(self): @classmethod def set_reader(cls, method_name): - """decorator to add dataloader support - """ assert isinstance(method_name, str) def wrapper(read_cls): @@ -275,6 +273,15 @@ def split(self, dev_ratio): @classmethod def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): + """Load data from a CSV file and return a DataSet object. + + :param str csv_path: path to the CSV file + :param List[str] or Tuple[str] headers: headers of the CSV file + :param str sep: delimiter in CSV file. Default: "," + :param bool dropna: If True, drop rows that have less entries than headers. + :return DataSet dataset: + + """ with open(csv_path, "r") as f: start_idx = 0 if headers is None: diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 0e30ab9b..2e12e757 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -28,15 +28,16 @@ def __init__(self, data, model, metrics, batch_size=16, use_cuda=False, verbose= self.metrics = _prepare_metrics(metrics) self.data = data - if torch.cuda.is_available() and self.use_cuda: - self._model = model.cuda() - else: - self._model = model self.use_cuda = use_cuda self.batch_size = batch_size self.verbose = verbose self._model_device = model.parameters().__next__().device + if torch.cuda.is_available() and self.use_cuda: + self._model = model.cuda() + else: + self._model = model + # check predict if hasattr(self._model, 'predict'): self._predict_func = self._model.predict diff --git a/test/core/test_batch.py b/test/core/test_batch.py index 6aa88b0b..08d803f1 100644 --- a/test/core/test_batch.py +++ b/test/core/test_batch.py @@ -22,8 +22,8 @@ def test_simple(self): def test_dataset_batching(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) - ds.set_input(x=True) - ds.set_target(y=True) + ds.set_input("x") + ds.set_target("y") iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) for x, y in iter: self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray)) diff --git a/test/data_for_tests/glove.6B.50d_test.txt b/test/data_for_tests/glove.6B.50d_test.txt index 8b443cca..707e48e8 100644 --- a/test/data_for_tests/glove.6B.50d_test.txt +++ b/test/data_for_tests/glove.6B.50d_test.txt @@ -1,10 +1,6 @@ the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581 -, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392 -. 
0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216 of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375 to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044 and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 -0.51332 -0.47368 -0.33075 -0.13834 0.2702 0.30938 -0.45012 -0.4127 -0.09932 0.038085 0.029749 0.10076 -0.25058 -0.51818 0.34558 0.44922 0.48791 -0.080866 -0.10121 -1.3777 -0.10866 -0.23201 0.012839 -0.46508 3.8463 0.31362 0.13643 -0.52244 0.3302 0.33707 -0.35601 0.32431 0.12041 0.3512 -0.069043 0.36885 0.25168 -0.24517 0.25381 0.1367 -0.31178 -0.6321 -0.25028 -0.38097 in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 -0.32821 0.09598 -0.82269 -0.36717 -0.67009 0.42909 0.016496 -0.23573 0.12864 -1.0953 0.43334 0.57067 -0.1036 0.20422 0.078308 -0.42795 -1.7984 -0.27865 0.11954 -0.12689 0.031744 3.8631 -0.17786 -0.082434 -0.62698 0.26497 -0.057185 -0.073521 0.46103 0.30862 0.12498 -0.48609 -0.0080272 0.031184 -0.36576 -0.42699 0.42164 -0.11666 -0.50703 -0.027273 -0.53285 -a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796 -" 0.25769 0.45629 -0.76974 -0.37679 0.59272 -0.063527 0.20545 -0.57385 -0.29009 -0.13662 0.32728 1.4719 -0.73681 -0.12036 0.71354 -0.46098 0.65248 0.48887 -0.51558 0.039951 -0.34307 -0.014087 0.86488 0.3546 0.7999 -1.4995 -1.8153 0.41128 0.23921 -0.43139 3.6623 -0.79834 -0.54538 0.16943 -0.82017 -0.3461 0.69495 -1.2256 -0.17992 -0.057474 0.030498 -0.39543 -0.38515 -1.0002 0.087599 -0.31009 -0.34677 -0.31438 0.75004 0.97065 -'s 0.23727 0.40478 -0.20547 0.58805 0.65533 0.32867 -0.81964 -0.23236 0.27428 0.24265 0.054992 0.16296 -1.2555 -0.086437 0.44536 0.096561 -0.16519 0.058378 -0.38598 0.086977 0.0033869 0.55095 -0.77697 -0.62096 0.092948 -2.5685 -0.67739 0.10151 -0.48643 -0.057805 3.1859 -0.017554 -0.16138 0.055486 -0.25885 -0.33938 -0.19928 0.26049 0.10478 -0.55934 -0.12342 0.65961 -0.51802 -0.82995 -0.082739 0.28155 -0.423 -0.27378 -0.007901 -0.030231 +a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 
-0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796 \ No newline at end of file diff --git a/test/data_for_tests/tutorial_sample_dataset.csv b/test/data_for_tests/tutorial_sample_dataset.csv new file mode 100644 index 00000000..c3137854 --- /dev/null +++ b/test/data_for_tests/tutorial_sample_dataset.csv @@ -0,0 +1,38 @@ +A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 1 +This quiet , introspective and entertaining independent is worth seeking . 4 +Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . 1 +A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . 3 +Aggressive self-glorification and a manipulative whitewash . 1 +A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . 4 +Narratively , Trouble Every Day is a plodding mess . 1 +The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations 3 +But it does n't leave you with much . 1 +You could hate it for the same reason . 1 +There 's little to recommend Snow Dogs , unless one considers cliched dialogue and perverse escapism a source of high hilarity . 1 +Kung Pow is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that . 1 +The performances are an absolute joy . 4 +Fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense . 3 +I still like Moonlight Mile , better judgment be damned . 3 +A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 +a bilingual charmer , just like the woman who inspired it 3 +Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 +As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 +It 's everything you 'd expect -- but nothing more . 2 +Best indie of the year , so far . 4 +Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 +It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 +That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 +The plot is romantic comedy boilerplate from start to finish . 2 +It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 +A film that clearly means to preach exclusively to the converted . 
2 +While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances . 1 +The latest vapid actor 's exercise to appropriate the structure of Arthur Schnitzler 's Reigen . 1 +More vaudeville show than well-constructed narrative , but on those terms it 's inoffensive and actually rather sweet . 2 +Nothing more than a run-of-the-mill action flick . 2 +Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire . 0 +Ice Age is the first computer-generated feature cartoon to feel like other movies , and that makes for some glacial pacing early on . 2 +There 's very little sense to what 's going on here , but the makers serve up the cliches with considerable dash . 2 +Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different . 2 +They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid . 1 +It almost feels as if the movie is more interested in entertaining itself than in amusing us . 1 +The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0 \ No newline at end of file diff --git a/test/io/test_embed_loader.py b/test/io/test_embed_loader.py index 0a7c4fcf..fc1e7124 100644 --- a/test/io/test_embed_loader.py +++ b/test/io/test_embed_loader.py @@ -1,12 +1,12 @@ import unittest from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.embed_loader import EmbedLoader class TestEmbedLoader(unittest.TestCase): def test_case(self): vocab = Vocabulary() vocab.update(["the", "in", "I", "to", "of", "hahaha"]) - embedding = EmbedLoader().fast_load_embedding(50, "../data_for_tests/glove.6B.50d_test.txt", vocab) - self.assertEqual(tuple(embedding.shape), (len(vocab), 50)) + # TODO: np.cov在linux上segment fault,原因未知 + # embedding = EmbedLoader().fast_load_embedding(50, "../data_for_tests/glove.6B.50d_test.txt", vocab) + # self.assertEqual(tuple(embedding.shape), (len(vocab), 50)) diff --git a/test/test_tutorial.py b/test/test_tutorial.py index 05338514..fe6a9d86 100644 --- a/test/test_tutorial.py +++ b/test/test_tutorial.py @@ -12,7 +12,8 @@ class TestTutorial(unittest.TestCase): def test_tutorial(self): # 从csv读取数据到DataSet - dataset = DataSet.read_csv("./data_for_tests/tutorial_sample_dataset.csv", headers=('raw_sentence', 'label'), + sample_path = "test/data_for_tests/tutorial_sample_dataset.csv" + dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t') print(len(dataset)) print(dataset[0]) @@ -88,7 +89,6 @@ def split_sent(ins): print('Train finished!') # 使用fastNLP的Tester测试脚本 - tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"), batch_size=4) acc = tester.test() From 77f8ac77daa414908ed90d477e4ae5217c092f76 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 3 Dec 2018 12:12:48 +0800 Subject: [PATCH 37/67] =?UTF-8?q?=E5=AF=B9trainer=E4=B8=ADcheck=20code?= =?UTF-8?q?=E7=9A=84=E6=8A=A5=E9=94=99=E4=BF=A1=E6=81=AF=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E4=BA=86=E5=A2=9E=E5=BC=BA=EF=BC=9B=E5=B0=86tester=E4=B8=AD?= =?UTF-8?q?=E7=9A=84output=E4=BF=AE=E6=94=B9=E4=B8=BApred=5Fdict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 6 +-- fastNLP/core/tester.py | 12 ++--- fastNLP/core/trainer.py | 12 ++--- fastNLP/core/utils.py | 
107 ++++++++++++++++++++++++++++++++-------- 4 files changed, 102 insertions(+), 35 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 070b1d17..b1fc110b 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -96,7 +96,7 @@ def __call__(self, pred_dict, target_dict, check=False): will be conducted) :param pred_dict: usually the output of forward or prediction function :param target_dict: usually features set as target.. - :param check: boolean, if check is True, it will force check `varargs, missing, unsed, duplicated`. + :param check: boolean, if check is True, it will force check `varargs, missing, unused, duplicated`. :return: """ if not callable(self.evaluate): @@ -148,8 +148,8 @@ def __call__(self, pred_dict, target_dict, check=False): missing = check_res.missing replaced_missing = list(missing) for idx, func_arg in enumerate(missing): - replaced_missing[idx] = f"`{self.param_map[func_arg]}`" + f"(assign to `{func_arg}` " \ - f"in `{get_func_signature(self.evaluate)}`)" + replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ + f"in `{self.__class__.__name__}`)" check_res = CheckRes(missing=replaced_missing, unused=check_res.unused, diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 0e30ab9b..0ff724c0 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -51,19 +51,18 @@ def test(self): # turn on the testing mode; clean up the history network = self._model self._mode(network, is_test=True) - output, truths = defaultdict(list), defaultdict(list) data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False) eval_results = {} try: with torch.no_grad(): for batch_x, batch_y in data_iterator: _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - prediction = self._data_forward(self._predict_func, batch_x) - if not isinstance(prediction, dict): + pred_dict = self._data_forward(self._predict_func, batch_x) + if not isinstance(pred_dict, dict): raise TypeError(f"The return value of {get_func_signature(self._predict_func)} " - f"must be `dict`, got {type(prediction)}.") + f"must be `dict`, got {type(pred_dict)}.") for metric in self.metrics: - metric(prediction, batch_y) + metric(pred_dict, batch_y) for metric in self.metrics: eval_result = metric.get_metric() if not isinstance(eval_result, dict): @@ -74,7 +73,8 @@ def test(self): except CheckError as e: prev_func_signature = get_func_signature(self._predict_func) _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, - check_res=e.check_res, output=output, batch_y=truths, check_level=0) + check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y, + dataset=self.data, check_level=0) if self.verbose >= 1: print("[tester] \n{}".format(self._format_eval_results(eval_results))) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 20d54073..b24af193 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -311,14 +311,14 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ batch_x=batch_x, check_level=check_level) refined_batch_x = _build_args(model.forward, **batch_x) - output = model(**refined_batch_x) + pred_dict = model(**refined_batch_x) func_signature = get_func_signature(model.forward) - if not isinstance(output, dict): - raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(output)}`.") + if not isinstance(pred_dict, dict): + raise TypeError(f"The 
return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.") # loss check try: - loss = losser(output, batch_y) + loss = losser(pred_dict, batch_y) # check loss output if batch_count == 0: if not isinstance(loss, torch.Tensor): @@ -333,8 +333,8 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ except CheckError as e: pre_func_signature = get_func_signature(model.forward) _check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature, - check_res=e.check_res, output=output, batch_y=batch_y, - check_level=check_level) + check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y, + dataset=dataset, check_level=check_level) model.zero_grad() if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH: break diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 95297a54..bfbeb6e5 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -229,29 +229,72 @@ def __init__(self, check_res:CheckRes, func_signature:str): STRICT_CHECK_LEVEL = 2 def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res:CheckRes, - output:dict, batch_y:dict, check_level=0): + pred_dict:dict, target_dict:dict, dataset, check_level=0): errs = [] - _unused = [] + unuseds = [] + _unused_field = [] + _unused_param = [] + suggestions = [] if check_res.varargs: - errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, " - f"please delete it.)") + errs.append(f"\tvarargs: *{check_res.varargs}") + suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") + + if check_res.unused: + for _unused in check_res.unused: + if _unused in target_dict: + _unused_field.append(_unused) + else: + _unused_param.append(_unused) + if _unused_field: + unuseds.append([f"\tunused field: {_unused_field}"]) + if _unused_param: + unuseds.append([f"\tunused param: {_unused_param}"]) + if check_res.missing: - errs.append(f"\tmissing param: `{check_res.missing}`, provided with `{list(output.keys())}`" - f"(from output of `{prev_func_signature}`) and `{list(batch_y.keys())}`(from targets in Dataset).") + errs.append(f"\tmissing param: {check_res.missing}") + _miss_in_dataset = [] + _miss_out_dataset = [] + for _miss in check_res.missing: + if '(' in _miss: + # if they are like 'SomeParam(assign to xxx)' + _miss = _miss.split('(')[0] + if _miss in dataset: + _miss_in_dataset.append(_miss) + else: + _miss_out_dataset.append(_miss) + + if _miss_in_dataset: + suggestions.append(f"You might need to set {_miss_in_dataset} as target(Right now " + f"target is {list(target_dict.keys())}).") + if _miss_out_dataset: + _tmp = (f"You might need to provide {_miss_out_dataset} in DataSet and set it as target(Right now " + f"target is {list(target_dict.keys())}) or output it " + f"in {prev_func_signature}(Right now it outputs {list(pred_dict.keys())}).") + if _unused_field: + _tmp += f"You can use DataSet.rename_field() to rename the field in `unused field:`. " + suggestions.append(_tmp) + if check_res.duplicated: - errs.append(f"\tduplicated param: {check_res.duplicated}, delete {check_res.duplicated} in the output of " - f"{check_res.duplicated} or do not set {check_res.duplicated} as targets. 
") - if check_res.unused: - _unused = [f"\tunused param: {check_res.unused}"] - if check_level == STRICT_CHECK_LEVEL: - errs.extend(_unused) + errs.append(f"\tduplicated param: {check_res.duplicated}.") + suggestions.append(f"Delete {check_res.duplicated} in the output of " + f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ") + + if check_level == STRICT_CHECK_LEVEL: + errs.extend(unuseds) if len(errs)>0: errs.insert(0, f'The following problems occurred when calling {func_signature}') - raise NameError('\n'.join(errs)) - if _unused: + sugg_str = "" + if len(suggestions)>1: + for idx, sugg in enumerate(suggestions): + sugg_str += f'({idx+1}). {sugg}' + else: + sugg_str += suggestions[0] + err_str = '\n' + '\n'.join(errs) + '\n\tSuggestion: ' + sugg_str + raise NameError(err_str) + if check_res.unused: if check_level == WARNING_CHECK_LEVEL: - _unused_warn = _unused[0] + f' in {func_signature}.' + _unused_warn = f'{check_res.unused} is not used by {func_signature}.' warnings.warn(message=_unused_warn) @@ -260,21 +303,45 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): func_signature = get_func_signature(forward_func) errs = [] + suggestions = [] _unused = [] if check_res.varargs: - errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") + errs.append(f"\tvarargs: {check_res.varargs}") + suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") if check_res.missing: - errs.append(f"\tmissing param: {check_res.missing}, provided with {list(batch_x.keys())}. " - f"Please set {check_res.missing} as input.") + errs.append(f"\tmissing param: {check_res.missing}") + _miss_in_dataset = [] + _miss_out_dataset = [] + for _miss in check_res.missing: + if _miss in dataset: + _miss_in_dataset.append(_miss) + else: + _miss_out_dataset.append(_miss) + if _miss_in_dataset: + suggestions.append(f"You might need to set {_miss_in_dataset} as input. ") + if _miss_out_dataset: + _tmp = f"You need to provide {_miss_out_dataset} in DataSet and set it as input. " + if check_res.unused: + _tmp += f"Or you might find it is in `unused field:`, you can use DataSet.rename_field() to " \ + f"rename the field in `unused field:`." + suggestions.append(_tmp) + if check_res.unused: - _unused = [f"\tunused param: {check_res.unused}"] + _unused = [f"\tunused field: {check_res.unused}"] if check_level == STRICT_CHECK_LEVEL: errs.extend(_unused) if len(errs)>0: errs.insert(0, f'The following problems occurred when calling {func_signature}') - raise NameError('\n'.join(errs)) + sugg_str = "" + if len(suggestions)>1: + for idx, sugg in enumerate(suggestions): + sugg_str += f'({idx+1}). {sugg}' + else: + sugg_str += suggestions[0] + err_str = '\n' + '\n'.join(errs) + '\n\tSuggestion: ' + sugg_str + raise NameError(err_str) if _unused: if check_level == WARNING_CHECK_LEVEL: _unused_warn = _unused[0] + f' in {func_signature}.' 
From f62060339edd1da3c3e1092057e014757714d28a Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Mon, 3 Dec 2018 12:37:33 +0800 Subject: [PATCH 38/67] =?UTF-8?q?All=20tests=20pass.=20Ready=20to=20merge.?= =?UTF-8?q?=20*=20=E6=9B=B4=E6=96=B0Loss=E7=9A=84=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E5=BD=A2=E5=8F=82=E8=B7=9Fmetric=E4=BF=9D=E6=8C=81=E4=B8=80?= =?UTF-8?q?=E8=87=B4=20*=20=E6=B7=BB=E5=8A=A0=E5=AF=B9=E5=87=A0=E7=A7=8Dlo?= =?UTF-8?q?ss=E7=9A=84=E6=B5=8B=E8=AF=95=20*=20embed=5Floader=E9=87=87?= =?UTF-8?q?=E7=94=A8=E7=BB=B4=E5=BA=A6=E7=8B=AC=E7=AB=8B=E7=9A=84=E6=96=B9?= =?UTF-8?q?=E6=B3=95=E9=87=87=E6=A0=B7=20*=20=E5=AF=B9=E5=BA=94=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E4=BB=A3=E7=A0=81=E7=9A=84=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 31 +++++++++++++++------------- fastNLP/io/embed_loader.py | 6 +++--- test/core/test_loss.py | 40 +++++++++++++++--------------------- test/core/test_trainer.py | 2 +- test/io/test_embed_loader.py | 6 +++--- test/test_tutorial.py | 4 ++-- 6 files changed, 42 insertions(+), 47 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index dce568bd..64ad8e23 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -70,11 +70,11 @@ def _init_param_map(self, key_map=None, **kwargs): raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.get_loss)}(Do not use " f"positional argument.).") - def __call__(self, output_dict, target_dict, force_check=False): + def __call__(self, pred_dict, target_dict, check=False): """ - :param output_dict: A dict from forward function of the network. + :param pred_dict: A dict from forward function of the network. :param target_dict: A dict from DataSet.batch_y. - :param force_check: Boolean. Force to check the mapping functions when it is running. + :param check: Boolean. Force to check the mapping functions when it is running. 
:return: """ args, defaults, defaults_val, varargs, kwargs = _get_arg_list(self.get_loss) @@ -88,7 +88,8 @@ def __call__(self, output_dict, target_dict, force_check=False): raise RuntimeError( f"There is not any param in function{get_func_signature(self.get_loss)}" ) - self._checked = self._checked and not force_check + + self._checked = self._checked and not check if not self._checked: for keys in args: if keys not in param_map: @@ -105,12 +106,12 @@ def __call__(self, output_dict, target_dict, force_check=False): duplicated = [] missing = [] if not self._checked: - for keys, val in output_dict.items(): + for keys, val in pred_dict.items(): if keys in target_dict.keys(): duplicated.append(keys) param_val_dict = {} - for keys, val in output_dict.items(): + for keys, val in pred_dict.items(): param_val_dict.update({keys: val}) for keys, val in target_dict.items(): param_val_dict.update({keys: val}) @@ -158,29 +159,31 @@ def __init__(self, func, key_map=None, **kwargs): class CrossEntropyLoss(LossBase): - def __init__(self, input=None, target=None): + def __init__(self, pred=None, target=None): super(CrossEntropyLoss, self).__init__() self.get_loss = F.cross_entropy - self._init_param_map(input=input, target=target) + self._init_param_map(input=pred, target=target) class L1Loss(LossBase): - def __init__(self): + def __init__(self, pred=None, target=None): super(L1Loss, self).__init__() self.get_loss = F.l1_loss + self._init_param_map(input=pred, target=target) class BCELoss(LossBase): - def __init__(self, input=None, target=None): + def __init__(self, pred=None, target=None): super(BCELoss, self).__init__() self.get_loss = F.binary_cross_entropy - self._init_param_map(input=input, target=target) + self._init_param_map(input=pred, target=target) class NLLLoss(LossBase): - def __init__(self): + def __init__(self, pred=None, target=None): super(NLLLoss, self).__init__() self.get_loss = F.nll_loss + self._init_param_map(input=pred, target=target) class LossInForward(LossBase): @@ -200,9 +203,9 @@ def get_loss(self, **kwargs): varargs=[]) raise CheckError(check_res=check_res, func_signature=get_func_signature(self.get_loss)) - def __call__(self, output_dict, predict_dict, force_check=False): + def __call__(self, pred_dict, target_dict, check=False): - loss = self.get_loss(**output_dict) + loss = self.get_loss(**pred_dict) if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): if not isinstance(loss, torch.Tensor): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 6e557c2b..779b7fd0 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -105,9 +105,9 @@ def fast_load_embedding(emb_dim, emb_file, vocab): if np.sum(hit_flags) < len(vocab): # some words from vocab are missing in pre-trained embedding - # we normally sample them + # we normally sample each dimension vocab_embed = embedding_matrix[np.where(hit_flags)] - mean, cov = vocab_embed.mean(axis=0), np.cov(vocab_embed.T) - sampled_vectors = np.random.multivariate_normal(mean, cov, size=(len(vocab) - np.sum(hit_flags),)) + sampled_vectors = np.random.normal(vocab_embed.mean(axis=0), vocab_embed.std(axis=0), + size=(len(vocab) - np.sum(hit_flags), emb_dim)) embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors return embedding_matrix diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 1124860b..9b77d0a1 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -271,40 +271,32 @@ def func6(a, b, **kwargs): loss3 = get_loss_3({'predict': predict}, {'truth': 
truth}) assert loss1 == loss2 and loss1 == loss3 - """ - get_loss_4 = LossFunc(func4) - loss4 = get_loss_4({'a': 1, 'b': 3}, {}) - print(loss4) - assert loss4 == (1 + 3) * 2 - - get_loss_5 = LossFunc(func4) - loss5 = get_loss_5({'a': 1, 'b': 3}, {'c': 4}) - print(loss5) - assert loss5 == (1 + 3) * 4 - - get_loss_6 = LossFunc(func6) - loss6 = get_loss_6({'a': 1, 'b': 3}, {'c': 4}) - print(loss6) - assert loss6 == (1 + 3) * 4 - - get_loss_7 = LossFunc(func6, c='cc') - loss7 = get_loss_7({'a': 1, 'b': 3}, {'cc': 4}) - print(loss7) - assert loss7 == (1 + 3) * 4 - """ - class TestLoss_v2(unittest.TestCase): def test_CrossEntropyLoss(self): - ce = loss.CrossEntropyLoss(input="my_predict", target="my_truth") + ce = loss.CrossEntropyLoss(pred="my_predict", target="my_truth") a = torch.randn(3, 5, requires_grad=False) b = torch.empty(3, dtype=torch.long).random_(5) ans = ce({"my_predict": a}, {"my_truth": b}) self.assertEqual(ans, torch.nn.functional.cross_entropy(a, b)) def test_BCELoss(self): - bce = loss.BCELoss(input="my_predict", target="my_truth") + bce = loss.BCELoss(pred="my_predict", target="my_truth") a = torch.sigmoid(torch.randn((3, 5), requires_grad=False)) b = torch.randn((3, 5), requires_grad=False) ans = bce({"my_predict": a}, {"my_truth": b}) self.assertEqual(ans, torch.nn.functional.binary_cross_entropy(a, b)) + + def test_L1Loss(self): + l1 = loss.L1Loss(pred="my_predict", target="my_truth") + a = torch.randn(3, 5, requires_grad=False) + b = torch.randn(3, 5) + ans = l1({"my_predict": a}, {"my_truth": b}) + self.assertEqual(ans, torch.nn.functional.l1_loss(a, b)) + + def test_NLLLoss(self): + l1 = loss.NLLLoss(pred="my_predict", target="my_truth") + a = F.log_softmax(torch.randn(3, 5, requires_grad=False), dim=0) + b = torch.tensor([1, 0, 4]) + ans = l1({"my_predict": a}, {"my_truth": b}) + self.assertEqual(ans, torch.nn.functional.nll_loss(a, b)) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index ee4a5770..bc8df2d2 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -32,7 +32,7 @@ def test_case(self): model = NaiveClassifier(2, 1) trainer = Trainer(train_set, model, - losser=BCELoss(input="predict", target="y"), + losser=BCELoss(pred="predict", target="y"), metrics=AccuracyMetric(pred="predict", target="y"), n_epochs=10, batch_size=32, diff --git a/test/io/test_embed_loader.py b/test/io/test_embed_loader.py index fc1e7124..60e3710e 100644 --- a/test/io/test_embed_loader.py +++ b/test/io/test_embed_loader.py @@ -1,12 +1,12 @@ import unittest from fastNLP.core.vocabulary import Vocabulary +from fastNLP.io.embed_loader import EmbedLoader class TestEmbedLoader(unittest.TestCase): def test_case(self): vocab = Vocabulary() vocab.update(["the", "in", "I", "to", "of", "hahaha"]) - # TODO: np.cov在linux上segment fault,原因未知 - # embedding = EmbedLoader().fast_load_embedding(50, "../data_for_tests/glove.6B.50d_test.txt", vocab) - # self.assertEqual(tuple(embedding.shape), (len(vocab), 50)) + embedding = EmbedLoader().fast_load_embedding(50, "test/data_for_tests/glove.6B.50d_test.txt", vocab) + self.assertEqual(tuple(embedding.shape), (len(vocab), 50)) diff --git a/test/test_tutorial.py b/test/test_tutorial.py index fe6a9d86..e7ee5cf6 100644 --- a/test/test_tutorial.py +++ b/test/test_tutorial.py @@ -72,7 +72,7 @@ def split_sent(ins): # 实例化Trainer,传入模型和数据,进行训练 copy_model = deepcopy(model) overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data, - losser=CrossEntropyLoss(input="output", target="label_seq"), + 
losser=CrossEntropyLoss(pred="output", target="label_seq"), metrics=AccuracyMetric(pred="predict", target="label_seq"), save_path="./save", batch_size=4, @@ -80,7 +80,7 @@ def split_sent(ins): overfit_trainer.train() trainer = Trainer(model=model, train_data=train_data, dev_data=test_data, - losser=CrossEntropyLoss(input="output", target="label_seq"), + losser=CrossEntropyLoss(pred="output", target="label_seq"), metrics=AccuracyMetric(pred="predict", target="label_seq"), save_path="./save", batch_size=4, From 6f58ec34b4357e5df3c7cb467b9906a823a8ca26 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Mon, 3 Dec 2018 19:53:34 +0800 Subject: [PATCH 39/67] =?UTF-8?q?Updates:=20*=20DataSet=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=5F=5Frepr=5F=5F=EF=BC=8C=E4=BC=98=E5=8C=96print(datset)?= =?UTF-8?q?=E7=9A=84=E8=BE=93=E5=87=BA=20*=20Instance=E4=BF=AE=E6=94=B9=5F?= =?UTF-8?q?=5Frepr=5F=5F=EF=BC=8C=E4=BC=98=E5=8C=96print=E7=9A=84=E8=BE=93?= =?UTF-8?q?=E5=87=BA=20*=20Optimizer=E4=BC=98=E5=8C=96=E4=BC=A0=E5=8F=82?= =?UTF-8?q?=E6=8F=90=E7=A4=BA=20*=20Trainer=E5=8E=BB=E9=99=A4kwargs?= =?UTF-8?q?=E5=8F=82=E6=95=B0=20*=20losses.py=E5=8A=A0=E4=B8=AA=E5=8F=82?= =?UTF-8?q?=E6=95=B0=20*=20=E5=AF=B9=E5=BA=94test=20code=E7=9A=84=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 9 ++++++ fastNLP/core/instance.py | 5 ++- fastNLP/core/losses.py | 1 + fastNLP/core/optimizer.py | 54 ++------------------------------ fastNLP/core/trainer.py | 3 +- test/core/test_dataset.py | 61 +++++++++++++++++++++++++++++++++++++ test/core/test_instance.py | 6 ++++ test/core/test_optimizer.py | 8 ----- 8 files changed, 82 insertions(+), 65 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 749d3e74..40ea0aab 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -110,6 +110,15 @@ def __len__(self): field = iter(self.field_arrays.values()).__next__() return len(field) + def __inner_repr__(self): + if len(self) < 20: + return ",\n".join([ins.__repr__() for ins in self]) + else: + return self[:5].__inner_repr__() + "\n...\n" + self[-5:].__inner_repr__() + + def __repr__(self): + return "DataSet(" + self.__inner_repr__() + ")" + def append(self, ins): """Add an instance to the DataSet. If the DataSet is not empty, the instance must have the same field names as the rest instances in the DataSet. diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 9dfe8fb8..dc65fa82 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -1,5 +1,3 @@ - - class Instance(object): """An Instance is an example of data. It is the collection of Fields. 
@@ -33,4 +31,5 @@ def __setitem__(self, name, field): return self.add_field(name, field) def __repr__(self): - return self.fields.__repr__() + return "{" + ",\n".join( + "\'" + field_name + "\': " + str(self.fields[field_name]) for field_name in self.fields) + "}" diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 64ad8e23..5f05eab1 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -202,6 +202,7 @@ def get_loss(self, **kwargs): all_needed=[], varargs=[]) raise CheckError(check_res=check_res, func_signature=get_func_signature(self.get_loss)) + return kwargs[self.loss_key] def __call__(self, pred_dict, target_dict, check=False): diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 5075fa02..692ff003 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -10,34 +10,7 @@ def __init__(self, model_params, **kwargs): class SGD(Optimizer): - def __init__(self, *args, **kwargs): - model_params, lr, momentum = None, 0.01, 0.9 - if len(args) == 0 and len(kwargs) == 0: - # SGD() - pass - elif len(args) == 1 and len(kwargs) == 0: - if isinstance(args[0], float) or isinstance(args[0], int): - # SGD(0.001) - lr = args[0] - elif hasattr(args[0], "__next__"): - # SGD(model.parameters()) args[0] is a generator - model_params = args[0] - else: - raise RuntimeError("Not supported type {}.".format(type(args[0]))) - elif 2 >= len(kwargs) > 0 and len(args) <= 1: - # SGD(lr=0.01), SGD(lr=0.01, momentum=0.9), SGD(model.parameters(), lr=0.1, momentum=0.9) - if len(args) == 1: - if hasattr(args[0], "__next__"): - model_params = args[0] - else: - raise RuntimeError("Not supported type {}.".format(type(args[0]))) - if not all(key in ("lr", "momentum") for key in kwargs): - raise RuntimeError("Invalid SGD arguments. Expect {}, got {}.".format(("lr", "momentum"), kwargs)) - lr = kwargs.get("lr", 0.01) - momentum = kwargs.get("momentum", 0.9) - else: - raise RuntimeError("SGD only accept 0 or 1 sequential argument, but got {}: {}".format(len(args), args)) - + def __init__(self, model_params=None, lr=0.01, momentum=0): super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) def construct_from_pytorch(self, model_params): @@ -49,30 +22,7 @@ def construct_from_pytorch(self, model_params): class Adam(Optimizer): - def __init__(self, *args, **kwargs): - model_params, lr, weight_decay = None, 0.01, 0.9 - if len(args) == 0 and len(kwargs) == 0: - pass - elif len(args) == 1 and len(kwargs) == 0: - if isinstance(args[0], float) or isinstance(args[0], int): - lr = args[0] - elif hasattr(args[0], "__next__"): - model_params = args[0] - else: - raise RuntimeError("Not supported type {}.".format(type(args[0]))) - elif 2 >= len(kwargs) > 0 and len(args) <= 1: - if len(args) == 1: - if hasattr(args[0], "__next__"): - model_params = args[0] - else: - raise RuntimeError("Not supported type {}.".format(type(args[0]))) - if not all(key in ("lr", "weight_decay") for key in kwargs): - raise RuntimeError("Invalid Adam arguments. 
Expect {}, got {}.".format(("lr", "weight_decay"), kwargs)) - lr = kwargs.get("lr", 0.01) - weight_decay = kwargs.get("weight_decay", 0.9) - else: - raise RuntimeError("Adam only accept 0 or 1 sequential argument, but got {}: {}".format(len(args), args)) - + def __init__(self, model_params=None, lr=0.01, weight_decay=0): super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay) def construct_from_pytorch(self, model_params): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index b24af193..5223bbab 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -32,8 +32,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat validate_every=-1, dev_data=None, use_cuda=False, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, - metric_key=None, - **kwargs): + metric_key=None): """ :param DataSet train_data: the training data diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 786e7248..fa3e1ea3 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -44,6 +44,9 @@ def test_add_append(self): self.assertEqual(dd.field_arrays["y"].content, [[1, 2, 3, 4]] * 10) self.assertEqual(dd.field_arrays["z"].content, [[5, 6]] * 10) + with self.assertRaises(RuntimeError): + dd.add_field("??", [[1, 2]] * 40) + def test_delete_field(self): dd = DataSet() dd.add_field("x", [[1, 2, 3]] * 10) @@ -65,8 +68,66 @@ def test_getitem(self): self.assertTrue(isinstance(sub_ds, DataSet)) self.assertEqual(len(sub_ds), 10) + def test_get_item_error(self): + with self.assertRaises(RuntimeError): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + _ = ds[40:] + + with self.assertRaises(KeyError): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + _ = ds["kom"] + + def test_len_(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) + self.assertEqual(len(ds), 40) + + ds = DataSet() + self.assertEqual(len(ds), 0) + def test_apply(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx") self.assertTrue("rx" in ds.field_arrays) self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1]) + + def test_contains(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) + self.assertTrue("x" in ds) + self.assertTrue("y" in ds) + self.assertFalse("z" in ds) + + def test_rename_field(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ds.rename_field("x", "xx") + self.assertTrue("xx" in ds) + self.assertFalse("x" in ds) + + with self.assertRaises(KeyError): + ds.rename_field("yyy", "oo") + + def test_input_target(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ds.set_input("x") + ds.set_target("y") + self.assertTrue(ds.field_arrays["x"].is_input) + self.assertTrue(ds.field_arrays["y"].is_target) + + with self.assertRaises(KeyError): + ds.set_input("xxx") + with self.assertRaises(KeyError): + ds.set_input("yyy") + + def test_get_input_name(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + self.assertEqual(ds.get_input_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_input]) + + def test_get_target_name(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target]) + + +class TestDataSetIter(unittest.TestCase): + def test__repr__(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 
10}) + for iter in ds: + self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4], 'y': [5, 6]}") diff --git a/test/core/test_instance.py b/test/core/test_instance.py index abe6b7f7..1342ba2c 100644 --- a/test/core/test_instance.py +++ b/test/core/test_instance.py @@ -27,3 +27,9 @@ def test_get_item(self): self.assertEqual(ins["x"], [1, 2, 3]) self.assertEqual(ins["y"], [4, 5, 6]) self.assertEqual(ins["z"], [1, 1, 1]) + + def test_repr(self): + fields = {"x": [1, 2, 3], "y": [4, 5, 6], "z": [1, 1, 1]} + ins = Instance(**fields) + # simple print, that is enough. + print(ins) diff --git a/test/core/test_optimizer.py b/test/core/test_optimizer.py index ab18b9be..7b29b826 100644 --- a/test/core/test_optimizer.py +++ b/test/core/test_optimizer.py @@ -11,9 +11,6 @@ def test_SGD(self): self.assertTrue("lr" in optim.__dict__["settings"]) self.assertTrue("momentum" in optim.__dict__["settings"]) - optim = SGD(0.001) - self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) - optim = SGD(lr=0.001) self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) @@ -25,17 +22,12 @@ def test_SGD(self): _ = SGD("???") with self.assertRaises(RuntimeError): _ = SGD(0.001, lr=0.002) - with self.assertRaises(RuntimeError): - _ = SGD(lr=0.009, shit=9000) def test_Adam(self): optim = Adam(torch.nn.Linear(10, 3).parameters()) self.assertTrue("lr" in optim.__dict__["settings"]) self.assertTrue("weight_decay" in optim.__dict__["settings"]) - optim = Adam(0.001) - self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) - optim = Adam(lr=0.001) self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) From 131e1ccd3b289388772ea4f1969558119789c33a Mon Sep 17 00:00:00 2001 From: yunfan Date: Mon, 3 Dec 2018 20:04:14 +0800 Subject: [PATCH 40/67] add _fast_param_map --- fastNLP/core/losses.py | 12 +++++++++++- fastNLP/core/metrics.py | 10 +++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 64ad8e23..c3459964 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -70,6 +70,12 @@ def _init_param_map(self, key_map=None, **kwargs): raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.get_loss)}(Do not use " f"positional argument.).") + def _fast_param_map(self, pred_dict, target_dict): + if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: + return pred_dict.values[0], target_dict.values[0] + return None + + def __call__(self, pred_dict, target_dict, check=False): """ :param pred_dict: A dict from forward function of the network. @@ -77,6 +83,11 @@ def __call__(self, pred_dict, target_dict, check=False): :param check: Boolean. Force to check the mapping functions when it is running. 
:return: """ + fast_param = self._fast_param_map(pred_dict, target_dict) + if fast_param is not None: + loss = self.get_loss(*fast_param) + return loss + args, defaults, defaults_val, varargs, kwargs = _get_arg_list(self.get_loss) if varargs is not None: raise RuntimeError( @@ -132,7 +143,6 @@ def __call__(self, pred_dict, target_dict, check=False): param_map_val = _map_args(reversed_param_map, **param_val_dict) param_value = _build_args(self.get_loss, **param_map_val) - loss = self.get_loss(**param_value) if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index b1fc110b..6216b16d 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -71,7 +71,7 @@ def _init_param_map(self, key_map=None, **kwargs): def get_metric(self, reset=True): raise NotImplemented - def _fast_call_evaluate(self, pred_dict, target_dict): + def _fast_param_map(self, pred_dict, target_dict): """ Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. @@ -80,7 +80,9 @@ def _fast_call_evaluate(self, pred_dict, target_dict): :param target_dict: :return: boolean, whether to go on codes in self.__call__(). When False, don't go on. """ - return False + if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: + return pred_dict.values[0] and target_dict.values[0] + return None def __call__(self, pred_dict, target_dict, check=False): """ @@ -103,7 +105,9 @@ def __call__(self, pred_dict, target_dict, check=False): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") if not check: - if self._fast_call_evaluate(pred_dict=pred_dict, target_dict=target_dict): + fast_param = self._fast_param_map(pred_dict=pred_dict, target_dict=target_dict) + if fast_param is not None: + self.evaluate(*fast_param) return if not self._checked: From 513876d5db1f7df2c08ea6984802901383ac3404 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Mon, 3 Dec 2018 20:50:51 +0800 Subject: [PATCH 41/67] =?UTF-8?q?Updates:=20*=20fix=20losses=E7=9A=84=5Ffa?= =?UTF-8?q?st=5Fparam=5Fmap=E7=9A=84bug=20*=20Trainer=E6=B7=BB=E5=8A=A0sam?= =?UTF-8?q?pelr=E5=88=9D=E5=A7=8B=E5=8C=96=E5=8F=82=E6=95=B0=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=20*=20refine=20codes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 3 +-- fastNLP/core/metrics.py | 57 +++++++++++++++++++-------------------- fastNLP/core/trainer.py | 17 ++++-------- fastNLP/core/utils.py | 38 +++++++++++++++----------- test/core/test_trainer.py | 14 +++------- test/test_tutorial.py | 16 +++++------ 6 files changed, 65 insertions(+), 80 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 82f47025..f2fb16d0 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -72,10 +72,9 @@ def _init_param_map(self, key_map=None, **kwargs): def _fast_param_map(self, pred_dict, target_dict): if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: - return pred_dict.values[0], target_dict.values[0] + return tuple(pred_dict.values())[0], tuple(target_dict.values())[0] return None - def __call__(self, pred_dict, target_dict, check=False): """ :param pred_dict: A dict from forward function of the network. 
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 6216b16d..d83c4022 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -1,4 +1,3 @@ - import inspect import warnings from collections import defaultdict @@ -7,11 +6,12 @@ import torch from fastNLP.core.utils import CheckError +from fastNLP.core.utils import CheckRes from fastNLP.core.utils import _build_args from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import seq_lens_to_masks -from fastNLP.core.utils import CheckRes + class MetricBase(object): def __init__(self): @@ -59,9 +59,10 @@ def _init_param_map(self, key_map=None, **kwargs): func_args = [arg for arg in func_spect.args if arg != 'self'] for func_param, input_param in self.param_map.items(): if func_param not in func_args: - raise NameError(f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the " - f"initialization parameters, or change the signature of" - f" {get_func_signature(self.evaluate)}.") + raise NameError( + f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the " + f"initialization parameters, or change the signature of" + f" {get_func_signature(self.evaluate)}.") # evaluate should not have varargs. if func_spect.varargs: @@ -113,7 +114,7 @@ def __call__(self, pred_dict, target_dict, check=False): if not self._checked: # 1. check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) - func_args = set([arg for arg in func_spect.args if arg!='self']) + func_args = set([arg for arg in func_spect.args if arg != 'self']) for func_arg, input_arg in self.param_map.items(): if func_arg not in func_args: raise NameError(f"`{func_arg}` not in {get_func_signature(self.evaluate)}.") @@ -121,7 +122,7 @@ def __call__(self, pred_dict, target_dict, check=False): # 2. only part of the param_map are passed, left are not for arg in func_args: if arg not in self.param_map: - self.param_map[arg] = arg #This param does not need mapping. + self.param_map[arg] = arg # This param does not need mapping. self._evaluate_args = func_args self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()} @@ -153,14 +154,14 @@ def __call__(self, pred_dict, target_dict, check=False): replaced_missing = list(missing) for idx, func_arg in enumerate(missing): replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ - f"in `{self.__class__.__name__}`)" + f"in `{self.__class__.__name__}`)" check_res = CheckRes(missing=replaced_missing, - unused=check_res.unused, - duplicated=duplicated, - required=check_res.required, - all_needed=check_res.all_needed, - varargs=check_res.varargs) + unused=check_res.unused, + duplicated=duplicated, + required=check_res.required, + all_needed=check_res.all_needed, + varargs=check_res.varargs) if check_res.missing or check_res.duplicated or check_res.varargs: raise CheckError(check_res=check_res, @@ -172,6 +173,7 @@ def __call__(self, pred_dict, target_dict, check=False): return + class AccuracyMetric(MetricBase): def __init__(self, pred=None, target=None, masks=None, seq_lens=None): super().__init__() @@ -191,7 +193,7 @@ def _fast_call_evaluate(self, pred_dict, target_dict): :param target_dict: :return: boolean, whether to go on codes in self.__call__(). When False, don't go on. 
""" - if len(pred_dict)==1 and len(target_dict)==1: + if len(pred_dict) == 1 and len(target_dict) == 1: pred = list(pred_dict.values())[0] target = list(target_dict.values())[0] self.evaluate(pred=pred, target=target) @@ -211,7 +213,7 @@ def evaluate(self, pred, target, masks=None, seq_lens=None): None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided. :return: dict({'acc': float}) """ - #TODO 这里报错需要更改,因为pred是啥用户并不知道。需要告知用户真实的value + # TODO 这里报错需要更改,因为pred是啥用户并不知道。需要告知用户真实的value if not isinstance(pred, torch.Tensor): raise TypeError(f"`pred` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(pred)}.") @@ -224,14 +226,14 @@ def evaluate(self, pred, target, masks=None, seq_lens=None): f"got {type(masks)}.") elif seq_lens is not None and not isinstance(seq_lens, torch.Tensor): raise TypeError(f"`seq_lens` in {get_func_signature(self.evaluate)} must be torch.Tensor," - f"got {type(seq_lens)}.") + f"got {type(seq_lens)}.") if masks is None and seq_lens is not None: masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) - if pred.size()==target.size(): + if pred.size() == target.size(): pass - elif len(pred.size())==len(target.size())+1: + elif len(pred.size()) == len(target.size()) + 1: pred = pred.argmax(dim=-1) else: raise RuntimeError(f"In {get_func_signature(self.evaluate)}, when pred have " @@ -245,18 +247,17 @@ def evaluate(self, pred, target, masks=None, seq_lens=None): self.acc_count += torch.sum(torch.eq(pred, target).float() * masks.float()).item() self.total += torch.sum(masks.float()).item() else: - self.acc_count += torch.sum(torch.eq(pred, target).float()).item() + self.acc_count += torch.sum(torch.eq(pred, target).float()).item() self.total += np.prod(list(pred.size())) def get_metric(self, reset=True): - evaluate_result = {'acc': round(self.acc_count/self.total, 6)} + evaluate_result = {'acc': round(self.acc_count / self.total, 6)} if reset: self.acc_count = 0 self.total = 0 return evaluate_result - def _prepare_metrics(metrics): """ @@ -278,7 +279,8 @@ def _prepare_metrics(metrics): raise TypeError(f"{metric_name}.get_metric must be callable, got {type(metric.get_metric)}.") _metrics.append(metric) else: - raise TypeError(f"The type of metric in metrics must be `fastNLP.MetricBase`, not `{type(metric)}`.") + raise TypeError( + f"The type of metric in metrics must be `fastNLP.MetricBase`, not `{type(metric)}`.") elif isinstance(metrics, MetricBase): _metrics = [metrics] else: @@ -300,6 +302,7 @@ def __call__(self, predict, truth): """ raise NotImplementedError + class ClassifyEvaluator(Evaluator): def __init__(self): super(ClassifyEvaluator, self).__init__() @@ -335,6 +338,7 @@ def __call__(self, predict, truth, **_): accuracy = total_correct / total_count return {"accuracy": float(accuracy)} + class SeqLabelEvaluator2(Evaluator): # 上面的evaluator应该是错误的 def __init__(self, seq_lens_field_name='word_seq_origin_len'): @@ -367,7 +371,7 @@ def __call__(self, predict, truth, **_): if x_i in self.end_tagidx_set: truth_count += 1 for j in range(start, idx_i + 1): - if y_[j]!=x_[j]: + if y_[j] != x_[j]: flag = False break if flag: @@ -380,8 +384,7 @@ def __call__(self, predict, truth, **_): R = corr_count / (float(truth_count) + 1e-6) F = 2 * P * R / (P + R + 1e-6) - return {"P": P, 'R':R, 'F': F} - + return {"P": P, 'R': R, 'F': F} class SNLIEvaluator(Evaluator): @@ -563,10 +566,6 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary'): return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 
-def classification_report(y_true, y_pred, labels=None, target_names=None, digits=2): - raise NotImplementedError - - def accuracy_topk(y_true, y_prob, k=1): """Compute accuracy of y_true matching top-k probable labels in y_prob. diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 5223bbab..dd5862d3 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -28,11 +28,9 @@ class Trainer(object): """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, - validate_every=-1, - dev_data=None, use_cuda=False, save_path=None, - optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, - metric_key=None): + def __init__(self, train_data, model, losser=None, metrics=None, optimizer=Adam(lr=0.01, weight_decay=0), + sampler=RandomSampler(), n_epochs=3, batch_size=32, print_every=50, validate_every=-1, dev_data=None, + use_cuda=False, metric_key=None, save_path=None, check_code_level=0): """ :param DataSet train_data: the training data @@ -54,7 +52,6 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :: metric_key="-PPL" # language model gets better as perplexity gets smaller - :param kwargs: """ super(Trainer, self).__init__() @@ -105,6 +102,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.print_every = int(print_every) self.validate_every = int(validate_every) self.best_metric_indicator = None + self.sampler = sampler self._model_device = model.parameters().__next__().device @@ -120,14 +118,9 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat batch_size=self.batch_size, use_cuda=self.use_cuda) - for k, v in kwargs.items(): - setattr(self, k, v) - self.step = 0 self.start_time = None # start timestamp - # print(self.__dict__) - def train(self): """Start Training. @@ -158,7 +151,7 @@ def pass_func(*args, **kwargs): epoch = 1 while epoch <= self.n_epochs: - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(), + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False) self._train_epoch(data_iterator, self.model, epoch, start) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index bfbeb6e5..6c101890 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -10,6 +10,8 @@ CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs'], verbose=False) + + def save_pickle(obj, pickle_path, file_name): """Save an object into a pickle file. 
@@ -53,6 +55,7 @@ def pickle_exist(pickle_path, pickle_name): else: return False + def _build_args(func, **kwargs): spect = inspect.getfullargspec(func) if spect.varkw is not None: @@ -108,7 +111,7 @@ def _check_arg_dict_list(func, args): assert callable(func) and isinstance(arg_dict_list, (list, tuple)) assert len(arg_dict_list) > 0 and isinstance(arg_dict_list[0], dict) spect = inspect.getfullargspec(func) - all_args = set([arg for arg in spect.args if arg!='self']) + all_args = set([arg for arg in spect.args if arg != 'self']) defaults = [] if spect.defaults is not None: defaults = [arg for arg in spect.defaults] @@ -130,6 +133,7 @@ def _check_arg_dict_list(func, args): all_needed=list(all_args), varargs=varargs) + def get_func_signature(func): """ @@ -153,7 +157,7 @@ def forward(self, a, b='a', **args) class_name = func.__self__.__class__.__name__ signature = inspect.signature(func) signature_str = str(signature) - if len(signature_str)>2: + if len(signature_str) > 2: _self = '(self, ' else: _self = '(self' @@ -176,12 +180,13 @@ def _is_function_or_method(func): return False return True + def _check_function_or_method(func): if not _is_function_or_method(func): raise TypeError(f"{type(func)} is not a method or function.") -def _move_dict_value_to_device(*args, device:torch.device): +def _move_dict_value_to_device(*args, device: torch.device): """ move data to model's device, element in *args should be dict. This is a inplace change. @@ -206,7 +211,8 @@ class CheckError(Exception): CheckError. Used in losses.LossBase, metrics.MetricBase. """ - def __init__(self, check_res:CheckRes, func_signature:str): + + def __init__(self, check_res: CheckRes, func_signature: str): errs = [f'The following problems occurred when calling `{func_signature}`'] if check_res.varargs: @@ -228,8 +234,9 @@ def __init__(self, check_res:CheckRes, func_signature:str): WARNING_CHECK_LEVEL = 1 STRICT_CHECK_LEVEL = 2 -def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res:CheckRes, - pred_dict:dict, target_dict:dict, dataset, check_level=0): + +def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_res: CheckRes, + pred_dict: dict, target_dict: dict, dataset, check_level=0): errs = [] unuseds = [] _unused_field = [] @@ -268,8 +275,8 @@ def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res: f"target is {list(target_dict.keys())}).") if _miss_out_dataset: _tmp = (f"You might need to provide {_miss_out_dataset} in DataSet and set it as target(Right now " - f"target is {list(target_dict.keys())}) or output it " - f"in {prev_func_signature}(Right now it outputs {list(pred_dict.keys())}).") + f"target is {list(target_dict.keys())}) or output it " + f"in {prev_func_signature}(Right now it outputs {list(pred_dict.keys())}).") if _unused_field: _tmp += f"You can use DataSet.rename_field() to rename the field in `unused field:`. " suggestions.append(_tmp) @@ -277,15 +284,15 @@ def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res: if check_res.duplicated: errs.append(f"\tduplicated param: {check_res.duplicated}.") suggestions.append(f"Delete {check_res.duplicated} in the output of " - f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ") + f"{prev_func_signature} or do not set {check_res.duplicated} as targets. 
") if check_level == STRICT_CHECK_LEVEL: errs.extend(unuseds) - if len(errs)>0: + if len(errs) > 0: errs.insert(0, f'The following problems occurred when calling {func_signature}') sugg_str = "" - if len(suggestions)>1: + if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): sugg_str += f'({idx+1}). {sugg}' else: @@ -332,10 +339,10 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): if check_level == STRICT_CHECK_LEVEL: errs.extend(_unused) - if len(errs)>0: + if len(errs) > 0: errs.insert(0, f'The following problems occurred when calling {func_signature}') sugg_str = "" - if len(suggestions)>1: + if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): sugg_str += f'({idx+1}). {sugg}' else: @@ -357,11 +364,11 @@ def seq_lens_to_masks(seq_lens, float=True): :return: list, np.ndarray or torch.Tensor, shape will be (B, max_length) """ if isinstance(seq_lens, np.ndarray): - assert len(np.shape(seq_lens))==1, f"seq_lens can only have one dimension, got {len(np.shape(seq_lens))}." + assert len(np.shape(seq_lens)) == 1, f"seq_lens can only have one dimension, got {len(np.shape(seq_lens))}." assert seq_lens.dtype in (int, np.int32, np.int64), f"seq_lens can only be integer, not {seq_lens.dtype}." raise NotImplemented elif isinstance(seq_lens, torch.LongTensor): - assert len(seq_lens.size())==1, f"seq_lens can only have one dimension, got {len(seq_lens.size())==1}." + assert len(seq_lens.size()) == 1, f"seq_lens can only have one dimension, got {len(seq_lens.size())==1}." batch_size = seq_lens.size(0) max_len = seq_lens.max() indexes = torch.arange(max_len).view(1, -1).repeat(batch_size, 1).to(seq_lens.device) @@ -375,4 +382,3 @@ def seq_lens_to_masks(seq_lens, float=True): raise NotImplemented else: raise NotImplemented - diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index bc8df2d2..0a59b3cd 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -31,15 +31,7 @@ def test_case(self): model = NaiveClassifier(2, 1) - trainer = Trainer(train_set, model, - losser=BCELoss(pred="predict", target="y"), - metrics=AccuracyMetric(pred="predict", target="y"), - n_epochs=10, - batch_size=32, - print_every=10, - validate_every=-1, - dev_data=dev_set, - optimizer=SGD(0.1), - check_code_level=2 - ) + trainer = Trainer(train_set, model, losser=BCELoss(pred="predict", target="y"), + metrics=AccuracyMetric(pred="predict", target="y"), optimizer=SGD(), n_epochs=10, + batch_size=32, print_every=10, validate_every=-1, dev_data=dev_set, check_code_level=2) trainer.train() diff --git a/test/test_tutorial.py b/test/test_tutorial.py index e7ee5cf6..f3648b4f 100644 --- a/test/test_tutorial.py +++ b/test/test_tutorial.py @@ -71,20 +71,16 @@ def split_sent(ins): # 实例化Trainer,传入模型和数据,进行训练 copy_model = deepcopy(model) - overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data, + overfit_trainer = Trainer(train_data=test_data, model=copy_model, losser=CrossEntropyLoss(pred="output", target="label_seq"), - metrics=AccuracyMetric(pred="predict", target="label_seq"), - save_path="./save", - batch_size=4, - n_epochs=10) + metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4, + dev_data=test_data, save_path="./save") overfit_trainer.train() - trainer = Trainer(model=model, train_data=train_data, dev_data=test_data, + trainer = Trainer(train_data=train_data, model=model, losser=CrossEntropyLoss(pred="output", target="label_seq"), - metrics=AccuracyMetric(pred="predict", target="label_seq"), - 
save_path="./save", - batch_size=4, - n_epochs=10) + metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4, + dev_data=test_data, save_path="./save") trainer.train() print('Train finished!') From ad3c5b6ef02947bb718382538d22c3407625acf5 Mon Sep 17 00:00:00 2001 From: yunfan Date: Mon, 3 Dec 2018 21:54:22 +0800 Subject: [PATCH 42/67] add magic iter in dataset --- fastNLP/core/dataset.py | 44 ++++++++++++----------- fastNLP/core/utils.py | 16 +++++++++ fastNLP/modules/encoder/char_embedding.py | 2 +- test/core/test_dataset.py | 2 +- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 40ea0aab..dea27174 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -26,24 +26,6 @@ class DataSet(object): However, it stores data in a different way: Field-first, Instance-second. """ - - class DataSetIter(object): - def __init__(self, data_set, idx=-1, **fields): - self.data_set = data_set - self.idx = idx - self.fields = fields - - def __next__(self): - self.idx += 1 - if self.idx >= len(self.data_set): - raise StopIteration - # this returns a copy - return self.data_set[self.idx] - - def __repr__(self): - return "\n".join(['{}: {}'.format(name, repr(self.data_set[name][self.idx])) for name - in self.data_set.get_fields().keys()]) - def __init__(self, data=None): """ @@ -72,7 +54,27 @@ def __contains__(self, item): return item in self.field_arrays def __iter__(self): - return self.DataSetIter(self) + def iter_func(): + for idx in range(len(self)): + yield self[idx] + return iter_func() + + def _inner_iter(self): + class Iter_ptr: + def __init__(self, dataset, idx): + self.dataset = dataset + self.idx = idx + def __getitem__(self, item): + assert self.idx < len(self.dataset), "index:{} out of range".format(self.idx) + assert item in self.dataset.field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx]) + return self.dataset.field_arrays[item][self.idx] + def __repr__(self): + return self.dataset[self.idx].__repr__() + + def inner_iter_func(): + for idx in range(len(self)): + yield Iter_ptr(self, idx) + return inner_iter_func() def __getitem__(self, idx): """Fetch Instance(s) at the `idx` position(s) in the dataset. @@ -232,7 +234,7 @@ def apply(self, func, new_field_name=None, is_input=False, is_target=False): :param str new_field_name: If not None, results of the function will be stored as a new field. :return results: if new_field_name is not passed, returned values of the function over all instances. """ - results = [func(ins) for ins in self] + results = [func(ins) for ins in self._inner_iter()] if new_field_name is not None: if new_field_name in self.field_arrays: # overwrite the field, keep same attributes @@ -248,7 +250,7 @@ def apply(self, func, new_field_name=None, is_input=False, is_target=False): return results def drop(self, func): - results = [ins for ins in self if not func(ins)] + results = [ins for ins in self._inner_iter() if not func(ins)] for name, old_field in self.field_arrays.items(): self.field_arrays[name].content = [ins[name] for ins in results] diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 6c101890..abe7889c 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -382,3 +382,19 @@ def seq_lens_to_masks(seq_lens, float=True): raise NotImplemented else: raise NotImplemented + + +def seq_mask(seq_len, max_len): + """Create sequence mask. 
+ + :param seq_len: list or torch.Tensor, the lengths of sequences in a batch. + :param max_len: int, the maximum sequence length in a batch. + :return mask: torch.LongTensor, [batch_size, max_len] + + """ + if not isinstance(seq_len, torch.Tensor): + seq_len = torch.LongTensor(seq_len) + seq_len = seq_len.view(-1, 1).long() # [batch_size, 1] + seq_range = torch.arange(start=0, end=max_len, dtype=torch.long, device=seq_len.device).view(1, -1) # [1, max_len] + return torch.gt(seq_len, seq_range) # [batch_size, max_len] + diff --git a/fastNLP/modules/encoder/char_embedding.py b/fastNLP/modules/encoder/char_embedding.py index 1ca3b5ba..249a73ad 100644 --- a/fastNLP/modules/encoder/char_embedding.py +++ b/fastNLP/modules/encoder/char_embedding.py @@ -43,7 +43,7 @@ def convolute(self, x): # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1] y = torch.squeeze(y, 2) # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1] - y = F.tanh(y) + y = torch.tanh(y) y, __ = torch.max(y, 2) # [batch_size*sent_length, feature_maps[i]] feats.append(y) diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index fa3e1ea3..8ca2ed86 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -130,4 +130,4 @@ class TestDataSetIter(unittest.TestCase): def test__repr__(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) for iter in ds: - self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4], 'y': [5, 6]}") + self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}") From 1421b7dfbabaec073e87717420b41c9c70f1539c Mon Sep 17 00:00:00 2001 From: yunfan Date: Mon, 3 Dec 2018 22:48:02 +0800 Subject: [PATCH 43/67] add this feature totally for yh --- fastNLP/core/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index dea27174..4925ac36 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -1,3 +1,4 @@ +import _pickle as pickle import numpy as np from fastNLP.core.fieldarray import FieldArray @@ -317,3 +318,12 @@ def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): for header, content in zip(headers, contents): _dict[header].append(content) return cls(_dict) + + def save(self, path): + with open(path, 'wb') as f: + pickle.dump(self, f) + + @staticmethod + def load(self, path): + with open(path, 'rb') as f: + return pickle.load(f) From beb55f5288b004a89a965efb9018f31ab2a9c940 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 3 Dec 2018 22:53:14 +0800 Subject: [PATCH 44/67] * change trainer iterating into tqdm --- fastNLP/core/dataset.py | 21 ++++-- fastNLP/core/trainer.py | 140 +++++++++++++++++++++----------------- fastNLP/core/utils.py | 2 +- test/core/test_trainer.py | 2 +- 4 files changed, 96 insertions(+), 69 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 749d3e74..3b5ebbbe 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -216,25 +216,36 @@ def wrapper(read_cls): return wrapper - def apply(self, func, new_field_name=None, is_input=False, is_target=False): + def apply(self, func, new_field_name=None, **kwargs): """Apply a function to every instance of the DataSet. :param func: a function that takes an instance as input. :param str new_field_name: If not None, results of the function will be stored as a new field. + :param **kwargs: Accept parameters will be + (1) is_input: boolean, will be ignored if new_field is None. If True, the new field will be as input. 
+ (2) is_target: boolean, will be ignored if new_field is None. If True, the new field will be as target. :return results: if new_field_name is not passed, returned values of the function over all instances. """ results = [func(ins) for ins in self] + extra_param = {} + if 'is_input' in kwargs: + extra_param['is_input'] = kwargs['is_input'] + if 'is_target' in kwargs: + extra_param['is_target'] = kwargs['is_target'] if new_field_name is not None: if new_field_name in self.field_arrays: # overwrite the field, keep same attributes old_field = self.field_arrays[new_field_name] + if 'is_input' not in extra_param: + extra_param['is_input'] = old_field.is_input + if 'is_target' not in extra_param: + extra_param['is_target'] = old_field.is_target self.add_field(name=new_field_name, fields=results, padding_val=old_field.padding_val, - is_input=old_field.is_input, - is_target=old_field.is_target) + **extra_param) else: - self.add_field(name=new_field_name, fields=results, is_input=is_input, is_target=is_target) + self.add_field(name=new_field_name, fields=results, **extra_param) else: return results @@ -295,7 +306,7 @@ def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): for col in headers: _dict[col] = [] for line_idx, line in enumerate(f, start_idx): - contents = line.split(sep) + contents = line.rstrip('\r\n').split(sep) if len(contents) != len(headers): if dropna: continue diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index b24af193..95749c73 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -1,7 +1,7 @@ import os import time from datetime import datetime -from datetime import timedelta +from tqdm import tqdm import torch from tensorboardX import SummaryWriter @@ -12,6 +12,7 @@ from fastNLP.core.losses import _prepare_losser from fastNLP.core.metrics import _prepare_metrics from fastNLP.core.optimizer import Adam +from fastNLP.core.sampler import BaseSampler from fastNLP.core.sampler import RandomSampler from fastNLP.core.sampler import SequentialSampler from fastNLP.core.tester import Tester @@ -28,12 +29,10 @@ class Trainer(object): """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, - validate_every=-1, - dev_data=None, use_cuda=False, save_path=None, + def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, update_every=50, + validate_every=-1, dev_data=None, use_cuda=False, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, - metric_key=None, - **kwargs): + metric_key=None, sampler=RandomSampler()): """ :param DataSet train_data: the training data @@ -42,7 +41,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :param MetricBase or List[MetricBase] metrics: a metric object or a list of metrics :param int n_epochs: the number of training epochs :param int batch_size: batch size for training and validation - :param int print_every: step interval to print next training information. Default: -1(no print). + :param int update_every: step interval to print next training information. Default: -1(no print). :param int validate_every: step interval to do next validation. Default: -1(validate every epoch). :param DataSet dev_data: the validation data :param use_cuda: @@ -54,8 +53,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat smaller, add a `-` character in front of the string. 
For example :: metric_key="-PPL" # language model gets better as perplexity gets smaller - - :param kwargs: + :param sampler: method used to generate batch data. """ super(Trainer, self).__init__() @@ -90,6 +88,10 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat # prepare loss losser = _prepare_losser(losser) + # sampler check + if not isinstance(sampler, BaseSampler): + raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) + if check_code_level > -1: _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, metric_key=metric_key, check_level=check_code_level) @@ -103,9 +105,10 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.batch_size = int(batch_size) self.use_cuda = bool(use_cuda) self.save_path = save_path - self.print_every = int(print_every) + self.print_every = int(update_every) self.validate_every = int(validate_every) self.best_metric_indicator = None + self.sampler = sampler self._model_device = model.parameters().__next__().device @@ -119,10 +122,8 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat data=self.dev_data, metrics=self.metrics, batch_size=self.batch_size, - use_cuda=self.use_cuda) - - for k, v in kwargs.items(): - setattr(self, k, v) + use_cuda=self.use_cuda, + verbose=0) self.step = 0 self.start_time = None # start timestamp @@ -140,8 +141,7 @@ def train(self): self._mode(self.model, is_test=False) - start = time.time() - self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + self.start_time = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')) print("training epochs started " + self.start_time) if self.save_path is None: class psudoSW: @@ -156,65 +156,81 @@ def pass_func(*args, **kwargs): path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time)) self._summary_writer = SummaryWriter(path) - epoch = 1 - while epoch <= self.n_epochs: - - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(), - as_numpy=False) - - self._train_epoch(data_iterator, self.model, epoch, start) + self._tqdm_train() - # validate_every override validation at end of epochs - if self.dev_data and self.validate_every <= 0: - self._do_validation() - epoch += 1 finally: self._summary_writer.close() del self._summary_writer - def _train_epoch(self, data_iterator, model, epoch, start): - """ - - :param data_iterator: - :param model: - :param epoch: - :param start: - :return: - """ - for batch_x, batch_y in data_iterator: - # TODO 这里可能会遇到问题,万一用户在model内部修改了prediction的device就会有问题 - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - prediction = self._data_forward(model, batch_x) - loss = self._compute_loss(prediction, batch_y) - self._grad_backward(loss) - self._update() - self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step) - for name, param in self.model.named_parameters(): - if param.requires_grad: - self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step) - # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step) - # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step) - if self.print_every > 0 and self.step % self.print_every == 0: - end = time.time() - diff = timedelta(seconds=round(end - start)) - print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( - epoch, self.step, 
loss.data, diff) - print(print_output) - - if self.validate_every > 0 and self.step % self.validate_every == 0: - self._do_validation() - - self.step += 1 + def _tqdm_train(self): + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, + as_numpy=False) + total_steps = data_iterator.num_batches*self.n_epochs + epoch = 1 + with tqdm(total=total_steps, postfix='loss:{0:<6.5f}', desc="Epoch {}/{}" + .format(epoch, self.n_epochs), leave=False, dynamic_ncols=True) as pbar: + ava_loss = 0 + for epoch in range(1, self.n_epochs+1): + pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) + for batch_x, batch_y in data_iterator: + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) + prediction = self._data_forward(self.model, batch_x) + loss = self._compute_loss(prediction, batch_y) + ava_loss += loss.item() + self._grad_backward(loss) + self._update() + self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step) + for name, param in self.model.named_parameters(): + if param.requires_grad: + self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step) + # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step) + # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step) + if (self.step+1) % self.print_every == 0: + pbar.update(self.print_every) + pbar.set_postfix_str("loss:{0:<6.5f}".format(ava_loss/self.print_every)) + ava_loss = 0 + + self.step += 1 + if self.validate_every > 0 and self.step % self.validate_every == 0 \ + and self.dev_data is not None: + eval_res = self._do_validation() + eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ + self.tester._format_eval_results(eval_res) + pbar = self._relocate_pbar(pbar, print_str=eval_str, total=total_steps, initial=self.step) + time.sleep(0.1) + if self.validate_every < 0 and self.dev_data: + eval_res = self._do_validation() + eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ + self.tester._format_eval_results(eval_res) + pbar = self._relocate_pbar(pbar, print_str=eval_str, total=total_steps, initial=self.step) + if epoch!=self.n_epochs: + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, + as_numpy=False) + pbar.close() + + def _relocate_pbar(self, pbar, total, initial, print_str=None): + postfix = pbar.postfix + desc = pbar.desc + pbar.close() + avg_time = pbar.avg_time + start_t = pbar.start_t + if print_str: + print(print_str) + pbar = tqdm(total=total, postfix=postfix, desc=desc, leave=False, initial=initial, dynamic_ncols=True) + pbar.start_t = start_t + pbar.avg_time = avg_time + pbar.sp(pbar.__repr__()) + return pbar def _do_validation(self): res = self.tester.test() for name, num in res.items(): - pass - # self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) + self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) if self.save_path is not None and self._better_eval_result(res): metric_key = self.metric_key if self.metric_key is not None else "None" self._save_model(self.model, "best_" + "_".join([self.model.__class__.__name__, metric_key, self.start_time])) + return res def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. 
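A minimal sketch of how the reworked loop above might be driven at this commit, assuming the default loss is picked up from the `loss` key returned by forward (LossInForward, as the later tests in this series rely on); the toy model, field names and hyper-parameters are illustrative only ::

    import numpy as np
    import torch
    import torch.nn.functional as F
    from torch import nn

    from fastNLP.core.dataset import DataSet
    from fastNLP.core.instance import Instance
    from fastNLP.core.sampler import SequentialSampler
    from fastNLP.core.trainer import Trainer


    class ToyModel(nn.Module):
        """Illustrative toy model: it computes its own loss, so no explicit losser is passed."""
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(2, 1)

        def forward(self, x, y):
            x, y = x.float(), y.float()            # cast defensively; the batch may arrive as float64
            pred = torch.sigmoid(self.fc(x))
            return {'loss': F.binary_cross_entropy(pred, y)}


    points = np.random.randn(200, 2)
    data = DataSet([Instance(x=[float(p[0]), float(p[1])], y=[float(p[0] + p[1] > 0)]) for p in points])
    data.set_input("x", "y", flag=True)            # forward() consumes both fields

    trainer = Trainer(train_data=data, model=ToyModel(),
                      n_epochs=3, batch_size=32,
                      update_every=10,               # progress-bar refresh interval at this commit
                      sampler=SequentialSampler())   # anything that is not a BaseSampler raises ValueError
    trainer.train()

With this setup the tqdm bar is refreshed every `update_every` steps with the loss averaged over that interval, and when `dev_data` is given the validation result is printed above the relocated bar at the chosen `validate_every` interval.
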
diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index bfbeb6e5..6d11686c 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -248,7 +248,7 @@ def _check_loss_evaluate(prev_func_signature:str, func_signature:str, check_res: if _unused_field: unuseds.append([f"\tunused field: {_unused_field}"]) if _unused_param: - unuseds.append([f"\tunused param: {_unused_param}"]) + unuseds.append([f"\tunused param: {_unused_param}"]) # output from predict or forward if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index ee4a5770..5dce64a5 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -36,7 +36,7 @@ def test_case(self): metrics=AccuracyMetric(pred="predict", target="y"), n_epochs=10, batch_size=32, - print_every=10, + update_every=1, validate_every=-1, dev_data=dev_set, optimizer=SGD(0.1), From 1fb1df4a31da9204412dc6f4d3b89a0b8594a9b2 Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 4 Dec 2018 10:43:40 +0800 Subject: [PATCH 45/67] =?UTF-8?q?1.=20metric=E4=BF=AE=E6=94=B9fast=5Fparam?= =?UTF-8?q?=202.=20trainer=E4=B8=ADupdate=5Fevery=E6=94=B9=E4=B8=BAprint?= =?UTF-8?q?=5Fevery,=20=E5=9B=A0=E4=B8=BAupdate=5Fevery=E5=8F=AF=E8=83=BD?= =?UTF-8?q?=E5=BC=95=E8=B5=B7optimizer=20update=E7=9A=84=E8=AF=AF=E8=A7=A3?= =?UTF-8?q?=203.=20fieldarray=20content=E6=94=AF=E6=8C=81=E4=BD=BF?= =?UTF-8?q?=E7=94=A8np.ndarray=E5=88=9D=E5=A7=8B=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/fieldarray.py | 6 + fastNLP/core/metrics.py | 73 ++++++------ fastNLP/core/trainer.py | 6 +- fastNLP/core/utils.py | 12 +- test/core/test_metrics.py | 227 ++++++++++++++++++------------------- test/core/test_trainer.py | 129 ++++++++++++++++++--- 6 files changed, 282 insertions(+), 171 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 14c52829..1b1a89c1 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -17,6 +17,12 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False :param bool is_input: If True, this FieldArray is used to the model input. """ self.name = name + if isinstance(content, list): + content = content + elif isinstance(content, np.ndarray): + content = content.tolist() + else: + raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) self.content = content self.padding_val = padding_val self.is_target = is_target diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index d83c4022..ff40e4e4 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -61,8 +61,7 @@ def _init_param_map(self, key_map=None, **kwargs): if func_param not in func_args: raise NameError( f"Parameter `{func_param}` is not in {get_func_signature(self.evaluate)}. Please check the " - f"initialization parameters, or change the signature of" - f" {get_func_signature(self.evaluate)}.") + f"initialization parameters, or change its signature.") # evaluate should not have varargs. if func_spect.varargs: @@ -79,13 +78,14 @@ def _fast_param_map(self, pred_dict, target_dict): such as pred_dict has one element, target_dict has one element :param pred_dict: :param target_dict: - :return: boolean, whether to go on codes in self.__call__(). When False, don't go on. + :return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping. 
""" + fast_param = {} if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: return pred_dict.values[0] and target_dict.values[0] - return None + return fast_param - def __call__(self, pred_dict, target_dict, check=False): + def __call__(self, pred_dict, target_dict): """ This method will call self.evaluate method. @@ -96,20 +96,19 @@ def __call__(self, pred_dict, target_dict, check=False): (4) whether params in output_dict, target_dict are not used by evaluate.(Might cause warning) Besides, before passing params into self.evaluate, this function will filter out params from output_dict and target_dict which are not used in self.evaluate. (but if **kwargs presented in self.evaluate, no filtering - will be conducted) + will be conducted.) + This function also support _fast_param_map. :param pred_dict: usually the output of forward or prediction function :param target_dict: usually features set as target.. - :param check: boolean, if check is True, it will force check `varargs, missing, unused, duplicated`. :return: """ if not callable(self.evaluate): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") - if not check: - fast_param = self._fast_param_map(pred_dict=pred_dict, target_dict=target_dict) - if fast_param is not None: - self.evaluate(*fast_param) - return + fast_param = self._fast_param_map(pred_dict=pred_dict, target_dict=target_dict) + if fast_param: + self.evaluate(**fast_param) + return if not self._checked: # 1. check consistence between signature and param_map @@ -147,7 +146,7 @@ def __call__(self, pred_dict, target_dict, check=False): duplicated.append(input_arg) # missing - if check or not self._checked: + if not self._checked: check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict]) # only check missing. missing = check_res.missing @@ -175,40 +174,49 @@ def __call__(self, pred_dict, target_dict, check=False): class AccuracyMetric(MetricBase): - def __init__(self, pred=None, target=None, masks=None, seq_lens=None): + def __init__(self, pred=None, target=None, seq_lens=None): super().__init__() - self._init_param_map(pred=pred, target=target, - masks=masks, seq_lens=seq_lens) + self._init_param_map(pred=pred, target=target, seq_lens=seq_lens) self.total = 0 self.acc_count = 0 - def _fast_call_evaluate(self, pred_dict, target_dict): + def _fast_param_map(self, pred_dict, target_dict): """ Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. such as pred_dict has one element, target_dict has one element :param pred_dict: :param target_dict: - :return: boolean, whether to go on codes in self.__call__(). When False, don't go on. + :return: dict, if dict is not None, pass it to self.evaluate. Otherwise do mapping. 
""" - if len(pred_dict) == 1 and len(target_dict) == 1: - pred = list(pred_dict.values())[0] - target = list(target_dict.values())[0] - self.evaluate(pred=pred, target=target) - return True - return False - - def evaluate(self, pred, target, masks=None, seq_lens=None): + fast_param = {} + targets = list(target_dict.values()) + if len(targets)==1 and isinstance(targets[0], torch.Tensor): + if len(pred_dict)==1: + pred = list(pred_dict.values())[0] + fast_param['pred'] = pred + elif len(pred_dict)==2: + pred1 = list(pred_dict.values())[0] + pred2 = list(pred_dict.values())[1] + if not (isinstance(pred1, torch.Tensor) and isinstance(pred2, torch.Tensor)): + return fast_param + if len(pred1.size())>len(pred2.size()): + fast_param['pred'] = pred1 + fast_param['seq_lens'] = pred2 + else: + return fast_param + fast_param['target'] = targets[0] + return fast_param + + def evaluate(self, pred, target, seq_lens=None): """ :param pred: List of (torch.Tensor, or numpy.ndarray). Element's shape can be: torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes]) :param target: List of (torch.Tensor, or numpy.ndarray). Element's can be: torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]), torch.Size([B, max_len]) - :param masks: List of (torch.Tensor, or numpy.ndarray). Element's can be: - None, None, torch.Size([B, max_len], torch.Size([B, max_len]) :param seq_lens: List of (torch.Tensor, or numpy.ndarray). Element's can be: None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided. :return: dict({'acc': float}) @@ -221,15 +229,14 @@ def evaluate(self, pred, target, masks=None, seq_lens=None): raise TypeError(f"`target` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(target)}.") - if masks is not None and not isinstance(masks, torch.Tensor): - raise TypeError(f"`masks` in {get_func_signature(self.evaluate)} must be torch.Tensor," - f"got {type(masks)}.") - elif seq_lens is not None and not isinstance(seq_lens, torch.Tensor): + if seq_lens is not None and not isinstance(seq_lens, torch.Tensor): raise TypeError(f"`seq_lens` in {get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_lens)}.") - if masks is None and seq_lens is not None: + if seq_lens is not None: masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) + else: + masks = None if pred.size() == target.size(): pass diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 57c79369..a0069571 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -29,7 +29,7 @@ class Trainer(object): """Main Training Loop """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, update_every=50, + def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, validate_every=-1, dev_data=None, use_cuda=False, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, metric_key=None, sampler=RandomSampler(), use_tqdm=True): @@ -41,7 +41,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :param MetricBase or List[MetricBase] metrics: a metric object or a list of metrics :param int n_epochs: the number of training epochs :param int batch_size: batch size for training and validation - :param int update_every: step interval to print next training information. Default: -1(no print). + :param int print_every: step interval to print next training information. Default: -1(no print). 
:param int validate_every: step interval to do next validation. Default: -1(validate every epoch). :param DataSet dev_data: the validation data :param use_cuda: @@ -106,7 +106,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.batch_size = int(batch_size) self.use_cuda = bool(use_cuda) self.save_path = save_path - self.print_every = int(update_every) + self.print_every = int(print_every) self.validate_every = int(validate_every) self.best_metric_indicator = None self.sampler = sampler diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 9fc091a7..4fd5eaec 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -214,7 +214,7 @@ class CheckError(Exception): """ def __init__(self, check_res: CheckRes, func_signature: str): - errs = [f'The following problems occurred when calling `{func_signature}`'] + errs = [f'Problems occurred when calling `{func_signature}`'] if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") @@ -276,8 +276,8 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re f"target is {list(target_dict.keys())}).") if _miss_out_dataset: _tmp = (f"You might need to provide {_miss_out_dataset} in DataSet and set it as target(Right now " - f"target is {list(target_dict.keys())}) or output it " - f"in {prev_func_signature}(Right now it outputs {list(pred_dict.keys())}).") + f"target has {list(target_dict.keys())}) or output it " + f"in {prev_func_signature}(Right now output has {list(pred_dict.keys())}).") if _unused_field: _tmp += f"You can use DataSet.rename_field() to rename the field in `unused field:`. " suggestions.append(_tmp) @@ -291,7 +291,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re errs.extend(unuseds) if len(errs) > 0: - errs.insert(0, f'The following problems occurred when calling {func_signature}') + errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): @@ -341,7 +341,7 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): errs.extend(_unused) if len(errs) > 0: - errs.insert(0, f'The following problems occurred when calling {func_signature}') + errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): @@ -356,7 +356,7 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): warnings.warn(message=_unused_warn) -def seq_lens_to_masks(seq_lens, float=True): +def seq_lens_to_masks(seq_lens, float=False): """ Convert seq_lens to masks. 
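A minimal sketch of the fast-parameter path in use, assuming unambiguous pred/target dicts; the key names and tensor shapes below are arbitrary ::

    import torch
    from fastNLP.core.metrics import AccuracyMetric

    metric = AccuracyMetric()                      # no pred/target key map given

    # one prediction tensor and one target tensor: _fast_param_map resolves them directly
    metric(pred_dict={"output": torch.zeros(4, 3)},
           target_dict={"label": torch.zeros(4)})
    print(metric.get_metric())                     # all-zero predictions agree with all-zero targets

    # two prediction tensors: the higher-rank one is taken as pred, the 1-D one as seq_lens
    metric(pred_dict={"output": torch.zeros(4, 3, 2), "lens": torch.tensor([3, 3, 2, 1])},
           target_dict={"label": torch.zeros(4, 3)})
    print(metric.get_metric())

If the dicts are ambiguous (for example an extra or non-tensor key), `_fast_param_map` returns an empty dict and the usual key-mapping and checking path runs instead.
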
diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index ffc11401..1b8ae70b 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -6,131 +6,126 @@ import numpy as np class TestAccuracyMetric(unittest.TestCase): - # def test_AccuracyMetric1(self): - # # (1) only input, targets passed - # pred_dict = {"pred": torch.zeros(4, 3)} - # target_dict = {'target': torch.zeros(4)} - # metric = AccuracyMetric() - # - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # print(metric.get_metric()) - # - # def test_AccuracyMetric2(self): - # # (2) with corrupted size - # try: - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4)} - # metric = AccuracyMetric() - # - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # print(metric.get_metric()) - # except Exception as e: - # print(e) - # return - # self.assertTrue(True, False), "No exception catches." - # - # def test_AccuracyMetric3(self): - # # (3) with check=False , the second batch is corrupted size - # try: - # metric = AccuracyMetric() - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4)} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # - # print(metric.get_metric()) - # except Exception as e: - # print(e) - # return - # self.assertTrue(True, False), "No exception catches." - # - # def test_AccuracyMetric4(self): - # # (4) with check=True , the second batch is corrupted size - # try: - # metric = AccuracyMetric() - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict) - # - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4)} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # - # print(metric.get_metric()) - # - # except Exception as e: - # print(e) - # return - # self.assertTrue(True, False), "No exception catches." + def test_AccuracyMetric1(self): + # (1) only input, targets passed + pred_dict = {"pred": torch.zeros(4, 3)} + target_dict = {'target': torch.zeros(4)} + metric = AccuracyMetric() + + metric(pred_dict=pred_dict, target_dict=target_dict, ) + print(metric.get_metric()) # - # def test_AccuaryMetric5(self): - # # (5) check reset - # metric = AccuracyMetric() - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict) - # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + def test_AccuracyMetric2(self): + # (2) with corrupted size + try: + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4)} + metric = AccuracyMetric() + + metric(pred_dict=pred_dict, target_dict=target_dict, ) + print(metric.get_metric()) + except Exception as e: + print(e) + return + self.assertTrue(True, False), "No exception catches." 
# - # pred_dict = {"pred": torch.zeros(4, 3, 2)} - # target_dict = {'target': torch.zeros(4, 3)+1} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # self.assertDictEqual(metric.get_metric(), {'acc':0}) + def test_AccuracyMetric3(self): + # (3) the second batch is corrupted size + try: + metric = AccuracyMetric() + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict) + + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4)} + metric(pred_dict=pred_dict, target_dict=target_dict) + + print(metric.get_metric()) + except Exception as e: + print(e) + return + self.assertTrue(True, False), "No exception catches." + # - # def test_AccuaryMetric6(self): - # # (6) check numpy array is not acceptable - # try: - # metric = AccuracyMetric() - # pred_dict = {"pred": np.zeros((4, 3, 2))} - # target_dict = {'target': np.zeros((4, 3))} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # self.assertDictEqual(metric.get_metric(), {'acc': 1}) - # except Exception as e: - # print(e) - # return - # self.assertTrue(True, False), "No exception catches." - - # def test_AccuaryMetric7(self): - # # (7) check map, match - # metric = AccuracyMetric(pred='predictions', target='targets') - # pred_dict = {"predictions": torch.zeros(4, 3, 2)} - # target_dict = {'targets': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # self.assertDictEqual(metric.get_metric(), {'acc': 1}) + def test_AccuaryMetric4(self): + # (5) check reset + metric = AccuracyMetric() + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) + + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)+1} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc':0}) + + def test_AccuaryMetric5(self): + # (5) check reset + metric = AccuracyMetric() + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(reset=False), {'acc': 1}) + + pred_dict = {"pred": torch.zeros(4, 3, 2)} + target_dict = {'target': torch.zeros(4, 3)+1} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc':0.5}) + # - # def test_AccuaryMetric8(self): - # # (8) check map, does not match - # try: - # metric = AccuracyMetric(pred='predictions', target='targets') - # pred_dict = {"prediction": torch.zeros(4, 3, 2)} - # target_dict = {'targets': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict, check=True) - # self.assertDictEqual(metric.get_metric(), {'acc': 1}) - # except Exception as e: - # print(e) - # return - # self.assertTrue(True, False), "No exception catches." - - # def test_AccuaryMetric9(self): - # # (9) check map, include unused - # try: - # metric = AccuracyMetric(pred='predictions', target='targets') - # pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused':1} - # target_dict = {'targets': torch.zeros(4, 3)} - # metric(pred_dict=pred_dict, target_dict=target_dict) - # self.assertDictEqual(metric.get_metric(), {'acc': 1}) - # except Exception as e: - # print(e) - # return - # self.assertTrue(True, False), "No exception catches." 
+ def test_AccuaryMetric6(self): + # (6) check numpy array is not acceptable + try: + metric = AccuracyMetric() + pred_dict = {"pred": np.zeros((4, 3, 2))} + target_dict = {'target': np.zeros((4, 3))} + metric(pred_dict=pred_dict, target_dict=target_dict) + except Exception as e: + print(e) + return + self.assertTrue(True, False), "No exception catches." + + def test_AccuaryMetric7(self): + # (7) check map, match + metric = AccuracyMetric(pred='predictions', target='targets') + pred_dict = {"predictions": torch.zeros(4, 3, 2)} + target_dict = {'targets': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) + + def test_AccuaryMetric8(self): + # (8) check map, does not match. use stop_fast_param to stop fast param map + try: + metric = AccuracyMetric(pred='predictions', target='targets') + pred_dict = {"prediction": torch.zeros(4, 3, 2), "stop_fast_param":1} + target_dict = {'targets': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict, ) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) + except Exception as e: + print(e) + return + self.assertTrue(True, False), "No exception catches." + + def test_AccuaryMetric9(self): + # (9) check map, include unused + try: + metric = AccuracyMetric(pred='prediction', target='targets') + pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused':1} + target_dict = {'targets': torch.zeros(4, 3)} + metric(pred_dict=pred_dict, target_dict=target_dict) + self.assertDictEqual(metric.get_metric(), {'acc': 1}) + except Exception as e: + print(e) + return + self.assertTrue(True, False), "No exception catches." def test_AccuaryMetric10(self): # (10) check _fast_metric try: metric = AccuracyMetric() - pred_dict = {"predictions": torch.zeros(4, 3, 2)} + pred_dict = {"predictions": torch.zeros(4, 3, 2), "masks": torch.zeros(4, 3)} target_dict = {'targets': torch.zeros(4, 3)} metric(pred_dict=pred_dict, target_dict=target_dict) self.assertDictEqual(metric.get_metric(), {'acc': 1}) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 2975f39c..ed4cc38d 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,6 +1,8 @@ import unittest import numpy as np +from torch import nn +import torch.nn.functional as F from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance @@ -11,19 +13,29 @@ from fastNLP.models.base_model import NaiveClassifier -class TrainerTestGround(unittest.TestCase): - def test_case(self): - mean = np.array([-3, -3]) - cov = np.array([[1, 0], [0, 1]]) - class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) - mean = np.array([3, 3]) - cov = np.array([[1, 0], [0, 1]]) - class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) - data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + - [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set +def prepare_fake_dataset2(*args, size=100): + ys = np.random.randint(4, size=100) + data = {'y': 
ys} + for arg in args: + data[arg] = np.random.randn(size, 5) + return DataSet(data=data) + +class TrainerTestGround(unittest.TestCase): + def test_case(self): + data_set = prepare_fake_dataset() data_set.set_input("x", flag=True) data_set.set_target("y", flag=True) @@ -36,10 +48,101 @@ def test_case(self): metrics=AccuracyMetric(pred="predict", target="y"), n_epochs=10, batch_size=32, - update_every=1, - validate_every=10, + print_every=50, + validate_every=-1, dev_data=dev_set, optimizer=SGD(lr=0.1), check_code_level=2, use_tqdm=True) - trainer.train() \ No newline at end of file + trainer.train() + + def test_trainer_suggestion1(self): + # 检查报错提示能否正确提醒用户。 + # 这里没有传入forward需要的数据。需要trainer提醒用户如何设置。 + dataset = prepare_fake_dataset2('x') + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + loss = F.cross_entropy(x, y) + return {'loss': loss} + + model = Model() + trainer = Trainer( + train_data=dataset, + model=model + ) + """ + # 应该获取到的报错提示 + NameError: + The following problems occurred when calling Model.forward(self, x1, x2, y) + missing param: ['y', 'x1', 'x2'] + Suggestion: (1). You might need to set ['y'] as input. + (2). You need to provide ['x1', 'x2'] in DataSet and set it as input. + + """ + + def test_trainer_suggestion2(self): + # 检查报错提示能否正确提醒用户 + # 这里传入forward需要的数据,看是否可以运行 + dataset = prepare_fake_dataset2('x1', 'x2') + dataset.set_input('x1', 'x2', 'y', flag=True) + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + loss = F.cross_entropy(x, y) + return {'loss': loss} + + model = Model() + trainer = Trainer( + train_data=dataset, + model=model, + use_tqdm=False, + print_every=2 + ) + trainer.train() + """ + # 应该正确运行 + """ + + def test_trainer_suggestion3(self): + # 检查报错提示能否正确提醒用户 + # 这里传入forward需要的数据,但是forward没有返回loss这个key + dataset = prepare_fake_dataset2('x1', 'x2') + dataset.set_input('x1', 'x2', 'y', flag=True) + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + loss = F.cross_entropy(x, y) + return {'wrong_loss_key': loss} + + model = Model() + trainer = Trainer( + train_data=dataset, + model=model, + use_tqdm=False, + print_every=2 + ) + trainer.train() + """ + # 应该正确运行 + """ + + + def test_case2(self): + # check metrics Wrong + data_set = prepare_fake_dataset2('x1', 'x2') From 661780b9757586d4bd56b0f8437cbc0b5d497eec Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 10:54:09 +0800 Subject: [PATCH 46/67] Improve FieldArray. Support nested list and a list of np.array --- fastNLP/core/fieldarray.py | 90 +++++++++++++++++++++--------------- fastNLP/core/losses.py | 1 + test/core/test_fieldarray.py | 18 ++++++-- 3 files changed, 69 insertions(+), 40 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 1b1a89c1..a1ece0aa 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -11,7 +11,7 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False """ :param str name: the name of the FieldArray - :param list content: a list of int, float, or a list of list. + :param list content: a list of int, float, str or np.ndarray, or a list of list of one. :param int padding_val: the integer for padding. 
Default: 0. :param bool is_target: If True, this FieldArray is used to compute loss. :param bool is_input: If True, this FieldArray is used to the model input. @@ -27,35 +27,46 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False self.padding_val = padding_val self.is_target = is_target self.is_input = is_input + + self.BASIC_TYPES = (int, float, str, np.ndarray) + self.is_2d_list = False self.pytype = self._type_detection(content) self.dtype = self._map_to_np_type(self.pytype) - @staticmethod - def _type_detection(content): + def _type_detection(self, content): + """ + :param content: a list of int, float, str or np.ndarray, or a list of list of one. + :return type: one of int, float, str, np.ndarray + + """ if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list): - # 2-D list - # TODO: refactor - type_set = set([type(item) for item in content[0]]) - else: - # 1-D list + # content is a 2-D list + type_set = set([self._type_detection(x) for x in content]) + if len(type_set) > 1: + raise RuntimeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set)) + self.is_2d_list = True + return type_set.pop() + + elif isinstance(content, list): + # content is a 1-D list if len(content) == 0: raise RuntimeError("Cannot create FieldArray with an empty list.") type_set = set([type(item) for item in content]) - if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)): - return type_set.pop() - elif len(type_set) == 2 and float in type_set and int in type_set: - # up-cast int to float - for idx, _ in enumerate(content): - content[idx] = float(content[idx]) - return float + if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES: + return type_set.pop() + elif len(type_set) == 2 and float in type_set and int in type_set: + # up-cast int to float + return float + else: + raise RuntimeError("Cannot create FieldArray with type {}".format(*type_set)) else: - raise ValueError("Unsupported type conversion detected in FieldArray: {}".format(*type_set)) + raise RuntimeError("Cannot create FieldArray with type {}".format(type(content))) @staticmethod def _map_to_np_type(basic_type): - type_mapping = {int: np.int64, float: np.float64, str: np.str} + type_mapping = {int: np.int64, float: np.float64, str: np.str, np.ndarray: np.ndarray} return type_mapping[basic_type] def __repr__(self): @@ -64,29 +75,35 @@ def __repr__(self): def append(self, val): """Add a new item to the tail of FieldArray. - :param val: int, float, str, or a list of them. + :param val: int, float, str, or a list of one. """ val_type = type(val) - if val_type is int and self.pytype is float: - # up-cast the appended value - val = float(val) - elif val_type is float and self.pytype is int: - # up-cast all other values in the content - for idx, _ in enumerate(self.content): - self.content[idx] = float(self.content[idx]) - self.pytype = float - self.dtype = self._map_to_np_type(self.pytype) - elif val_type is list: + if val_type == list: # shape check + if self.is_2d_list is False: + raise RuntimeError("Cannot append a list into a 1-D FieldArray. 
Please provide an element.") if len(val) == 0: - raise ValueError("Cannot append an empty list.") + raise RuntimeError("Cannot append an empty list.") + val_list_type = [type(_) for _ in val] # type check + if len(val_list_type) == 2 and int in val_list_type and float in val_list_type: + # up-cast int to float + val_type = float + elif len(val_list_type) == 1: + val_type = val_list_type[0] else: - if type(val[0]) != self.pytype: - raise ValueError( - "Cannot append a list of {}-type value into a {}-tpye FieldArray.". - format(type(val[0]), self.pytype)) - elif val_type != self.pytype: - raise ValueError("Cannot append a {}-type value into a {}-tpye FieldArray.".format(val_type, self.pytype)) - + raise RuntimeError("Cannot append a list of {}".format(val_list_type)) + else: + if self.is_2d_list is True: + raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.") + if val_type == float and self.pytype == int: + # up-cast + self.pytype = float + self.dtype = self._map_to_np_type(self.pytype) + elif val_type == int and self.pytype == float: + pass + elif val_type == self.pytype: + pass + else: + raise RuntimeError("Cannot append type {} into type {}".format(val_type, self.pytype)) self.content.append(val) def __getitem__(self, indices): @@ -102,7 +119,6 @@ def get(self, indices): :param indices: an int, or a list of int. :return: """ - # TODO: 返回行为不一致,有隐患 if isinstance(indices, int): return self.content[indices] assert self.is_input is True or self.is_target is True diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index f2fb16d0..af3d2ef0 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -126,6 +126,7 @@ def __call__(self, pred_dict, target_dict, check=False): for keys, val in target_dict.items(): param_val_dict.update({keys: val}) + # TODO: use the origin key to raise error if not self._checked: for keys in args: if param_map[keys] not in param_val_dict.keys(): diff --git a/test/core/test_fieldarray.py b/test/core/test_fieldarray.py index 883e1136..0264c2ff 100644 --- a/test/core/test_fieldarray.py +++ b/test/core/test_fieldarray.py @@ -24,19 +24,31 @@ def test(self): def test_type_conversion(self): fa = FieldArray("x", [1.2, 2.2, 3, 4, 5], is_input=True) self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.double) + self.assertEqual(fa.dtype, np.float64) fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True) fa.append(1.3333) self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.double) + self.assertEqual(fa.dtype, np.float64) fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=False) fa.append(10) self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.double) + self.assertEqual(fa.dtype, np.float64) fa = FieldArray("y", ["a", "b", "c", "d"], is_input=False) fa.append("e") self.assertEqual(fa.dtype, np.str) self.assertEqual(fa.pytype, str) + + def test_support_np_array(self): + fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=False) + self.assertEqual(fa.dtype, np.ndarray) + + fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5])) + self.assertEqual(fa.pytype, np.ndarray) + + def test_nested_list(self): + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.1, 2.2, 3.3, 4.4, 5.5]], is_input=False) + self.assertEqual(fa.pytype, float) + self.assertEqual(fa.dtype, np.float64) From 4b099bb0ddee13e3414a18f1eccd19ecd9286248 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 11:16:24 +0800 Subject: [PATCH 47/67] * add tqdm in requirements.txt * fix FieldArray type check bugs 
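Taken together with the previous commit, FieldArray usage after this fix might look like the following minimal sketch; the field names are arbitrary, and the pre-fix behaviour noted in the comments follows from the `val_list_type` change in the diff below ::

    import numpy as np
    from fastNLP.core.fieldarray import FieldArray

    # 1-D content: appending a float to an int field up-casts the whole field to float
    fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True)
    fa.append(1.3333)
    assert fa.pytype is float and fa.dtype == np.float64

    # 2-D content: only lists may be appended; a homogeneous list such as [5.5, 6.6]
    # now passes the type check because val_list_type is built as a set, so a pair of
    # floats is seen as a single type (previously it fell through to the error branch)
    fa2d = FieldArray("y", [[1.1, 2.2], [3.3, 4.4]], is_input=True)
    fa2d.append([5.5, 6.6])
    # fa2d.append(7.7)   # raises: cannot append a non-list into a 2-D FieldArray

    # a list of np.ndarray is accepted as well (added in the previous commit)
    fa_np = FieldArray("z", [np.array([1.1, 2.2, 3.3])], is_input=False)
    fa_np.append(np.array([4.4, 5.5, 6.6]))
    assert fa_np.pytype is np.ndarray
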
--- fastNLP/core/fieldarray.py | 4 ++-- requirements.txt | 1 + test/core/test_trainer.py | 24 ++++++++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index a1ece0aa..0a94b26c 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -83,12 +83,12 @@ def append(self, val): raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.") if len(val) == 0: raise RuntimeError("Cannot append an empty list.") - val_list_type = [type(_) for _ in val] # type check + val_list_type = set([type(_) for _ in val]) # type check if len(val_list_type) == 2 and int in val_list_type and float in val_list_type: # up-cast int to float val_type = float elif len(val_list_type) == 1: - val_type = val_list_type[0] + val_type = val_list_type.pop() else: raise RuntimeError("Cannot append a list of {}".format(val_list_type)) else: diff --git a/requirements.txt b/requirements.txt index 91a3f040..60ab7849 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.14.2 torch>=0.4.0 tensorboardX +tqdm \ No newline at end of file diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index ed4cc38d..2b14aa11 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,8 +1,8 @@ import unittest import numpy as np -from torch import nn import torch.nn.functional as F +from torch import nn from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance @@ -26,6 +26,7 @@ def prepare_fake_dataset(): [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) return data_set + def prepare_fake_dataset2(*args, size=100): ys = np.random.randint(4, size=100) data = {'y': ys} @@ -33,6 +34,7 @@ def prepare_fake_dataset2(*args, size=100): data[arg] = np.random.randn(size, 5) return DataSet(data=data) + class TrainerTestGround(unittest.TestCase): def test_case(self): data_set = prepare_fake_dataset() @@ -55,15 +57,20 @@ def test_case(self): check_code_level=2, use_tqdm=True) trainer.train() + """ + # 应该正确运行 + """ def test_trainer_suggestion1(self): # 检查报错提示能否正确提醒用户。 # 这里没有传入forward需要的数据。需要trainer提醒用户如何设置。 dataset = prepare_fake_dataset2('x') + class Model(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) @@ -72,10 +79,12 @@ def forward(self, x1, x2, y): return {'loss': loss} model = Model() - trainer = Trainer( - train_data=dataset, - model=model - ) + + with self.assertRaises(NameError): + trainer = Trainer( + train_data=dataset, + model=model + ) """ # 应该获取到的报错提示 NameError: @@ -91,10 +100,12 @@ def test_trainer_suggestion2(self): # 这里传入forward需要的数据,看是否可以运行 dataset = prepare_fake_dataset2('x1', 'x2') dataset.set_input('x1', 'x2', 'y', flag=True) + class Model(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) @@ -119,10 +130,12 @@ def test_trainer_suggestion3(self): # 这里传入forward需要的数据,但是forward没有返回loss这个key dataset = prepare_fake_dataset2('x1', 'x2') dataset.set_input('x1', 'x2', 'y', flag=True) + class Model(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) @@ -142,7 +155,6 @@ def forward(self, x1, x2, y): # 应该正确运行 """ - def test_case2(self): # check metrics Wrong data_set = prepare_fake_dataset2('x1', 'x2') From 
a1a41c2d8b0df658fc0067fb37f3a0eb16db36e8 Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 4 Dec 2018 12:58:56 +0800 Subject: [PATCH 48/67] =?UTF-8?q?1.=20unused=E6=8A=A5=E9=94=99=E8=BF=90?= =?UTF-8?q?=E8=A1=8C=E9=94=99=E8=AF=AF=E4=BF=AE=E5=A4=8D=202.=20loss?= =?UTF-8?q?=E4=B8=AD=E4=BF=AE=E5=A4=8D=E4=B8=80=E4=B8=AA=E9=94=99=E8=AF=AF?= =?UTF-8?q?=203.=20metric=E4=B8=ADfast=5Fparam=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 6 +-- fastNLP/core/metrics.py | 14 +++++-- fastNLP/core/trainer.py | 2 + fastNLP/core/utils.py | 22 ++++++----- test/core/test_trainer.py | 79 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 15 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index f2fb16d0..76e9be0d 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -147,7 +147,7 @@ def __call__(self, pred_dict, target_dict, check=False): if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): if not isinstance(loss, torch.Tensor): raise RuntimeError(f"loss ERROR: loss except a torch.Tensor but get {type(loss)}") - raise RuntimeError(f"loss ERROR: the size of loss except torch.Size([]) but got {loss.size}") + raise RuntimeError(f"loss ERROR: the size of loss except torch.Size([]) but got {loss.size()}") return loss @@ -219,8 +219,8 @@ def __call__(self, pred_dict, target_dict, check=False): if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): if not isinstance(loss, torch.Tensor): - raise TypeError(f"loss ERROR: loss except a torch.Tensor but got {type(loss)}") - raise RuntimeError(f"loss ERROR: the size of loss except torch.Size([]) but got {loss.size}") + raise TypeError(f"loss excepts to be a torch.Tensor, got {type(loss)}") + raise RuntimeError(f"The size of loss excepts to be torch.Size([]), got {loss.size()}") return loss diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index ff40e4e4..c17d408b 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -202,12 +202,20 @@ def _fast_param_map(self, pred_dict, target_dict): pred2 = list(pred_dict.values())[1] if not (isinstance(pred1, torch.Tensor) and isinstance(pred2, torch.Tensor)): return fast_param - if len(pred1.size())>len(pred2.size()): - fast_param['pred'] = pred1 - fast_param['seq_lens'] = pred2 + if len(pred1.size())len(pred2.size()) and len(pred2.size())==1: + seq_lens = pred2 + pred = pred1 + else: + return fast_param + fast_param['pred'] = pred + fast_param['seq_lens'] = seq_lens else: return fast_param fast_param['target'] = targets[0] + # TODO need to make sure they all have same batch_size return fast_param def evaluate(self, pred, target, seq_lens=None): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a0069571..13a3490a 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -48,6 +48,8 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :param str save_path: file path to save models :param Optimizer optimizer: an optimizer object :param int check_code_level: level of FastNLP code checker. -1: don't check, 0: ignore. 1: warning. 2: strict. + `ignore` will not check unused field; `warning` when warn if some field are not used; `strict` means + it will raise error if some field are not used. :param str metric_key: a single indicator used to decide the best model based on metric results. It must be one of the keys returned by the FIRST metric in `metrics`. 
If the overall result gets better if the indicator gets smaller, add a `-` character in front of the string. For example diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 4fd5eaec..0019b022 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -254,9 +254,9 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re else: _unused_param.append(_unused) if _unused_field: - unuseds.append([f"\tunused field: {_unused_field}"]) + unuseds.append(f"\tunused field: {_unused_field}") if _unused_param: - unuseds.append([f"\tunused param: {_unused_param}"]) # output from predict or forward + unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") @@ -278,8 +278,8 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re _tmp = (f"You might need to provide {_miss_out_dataset} in DataSet and set it as target(Right now " f"target has {list(target_dict.keys())}) or output it " f"in {prev_func_signature}(Right now output has {list(pred_dict.keys())}).") - if _unused_field: - _tmp += f"You can use DataSet.rename_field() to rename the field in `unused field:`. " + # if _unused_field: + # _tmp += f"You can use DataSet.rename_field() to rename the field in `unused field:`. " suggestions.append(_tmp) if check_res.duplicated: @@ -287,7 +287,9 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re suggestions.append(f"Delete {check_res.duplicated} in the output of " f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ") - if check_level == STRICT_CHECK_LEVEL: + if len(errs)>0: + errs.extend(unuseds) + elif check_level == STRICT_CHECK_LEVEL: errs.extend(unuseds) if len(errs) > 0: @@ -330,14 +332,16 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): suggestions.append(f"You might need to set {_miss_in_dataset} as input. ") if _miss_out_dataset: _tmp = f"You need to provide {_miss_out_dataset} in DataSet and set it as input. " - if check_res.unused: - _tmp += f"Or you might find it is in `unused field:`, you can use DataSet.rename_field() to " \ - f"rename the field in `unused field:`." + # if check_res.unused: + # _tmp += f"Or you might find it in `unused field:`, you can use DataSet.rename_field() to " \ + # f"rename the field in `unused field:`." 
suggestions.append(_tmp) if check_res.unused: _unused = [f"\tunused field: {check_res.unused}"] - if check_level == STRICT_CHECK_LEVEL: + if len(errs)>0: + errs.extend(_unused) + elif check_level == STRICT_CHECK_LEVEL: errs.extend(_unused) if len(errs) > 0: diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index ed4cc38d..fb6d02f8 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -7,6 +7,7 @@ from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance from fastNLP.core.losses import BCELoss +from fastNLP.core.losses import LossInForward from fastNLP.core.metrics import AccuracyMetric from fastNLP.core.optimizer import SGD from fastNLP.core.trainer import Trainer @@ -142,6 +143,84 @@ def forward(self, x1, x2, y): # 应该正确运行 """ + def test_trainer_suggestion4(self): + # 检查报错提示能否正确提醒用户 + # 这里传入forward需要的数据,是否可以正确提示unused + dataset = prepare_fake_dataset2('x1', 'x_unused') + dataset.set_input('x1', 'x_unused', 'y', flag=True) + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + loss = F.cross_entropy(x, y) + return {'loss': loss} + + model = Model() + trainer = Trainer( + train_data=dataset, + model=model, + use_tqdm=False, + print_every=2 + ) + + def test_trainer_suggestion5(self): + # 检查报错提示能否正确提醒用户 + # 这里传入多余参数,让其duplicate, 但这里因为y不会被调用,所以其实不会报错 + dataset = prepare_fake_dataset2('x1', 'x_unused') + dataset.rename_field('x_unused', 'x2') + dataset.set_input('x1', 'x2', 'y') + dataset.set_target('y') + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + loss = F.cross_entropy(x, y) + return {'loss': loss} + + model = Model() + trainer = Trainer( + train_data=dataset, + model=model, + use_tqdm=False, + print_every=2 + ) + + def test_trainer_suggestion6(self): + # 检查报错提示能否正确提醒用户 + # 这里传入多余参数,让其duplicate + dataset = prepare_fake_dataset2('x1', 'x_unused') + dataset.rename_field('x_unused', 'x2') + dataset.set_input('x1', 'x2', 'y') + dataset.set_target('x1') + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + loss = F.cross_entropy(x, y) + return {'pred': x} + + model = Model() + trainer = Trainer( + train_data=dataset, + model=model, + dev_data=dataset, + metrics=AccuracyMetric(), + use_tqdm=False, + print_every=2 + ) + def test_case2(self): # check metrics Wrong From 9acdb54fc8262f53913f08e058378f5fb0105d77 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 14:17:31 +0800 Subject: [PATCH 49/67] =?UTF-8?q?=E4=BC=98=E5=8C=96loss=E5=9C=A8missing?= =?UTF-8?q?=E5=92=8Cduplicate=E6=97=B6=E6=8A=A5=E9=94=99=E7=9A=84=E4=BF=A1?= =?UTF-8?q?=E6=81=AF:=E8=BF=94=E5=9B=9Eloss=E5=88=9D=E5=A7=8B=E5=8C=96?= =?UTF-8?q?=E7=BA=A6=E5=AE=9A=E6=8E=A5=E5=8F=97=E7=9A=84key?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 5 ++--- test/core/test_loss.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 698cefb3..c1e8de0e 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -118,7 +118,7 @@ def __call__(self, pred_dict, target_dict, check=False): if not self._checked: for keys, val in 
pred_dict.items(): if keys in target_dict.keys(): - duplicated.append(keys) + duplicated.append(param_map[keys]) param_val_dict = {} for keys, val in pred_dict.items(): @@ -126,11 +126,10 @@ def __call__(self, pred_dict, target_dict, check=False): for keys, val in target_dict.items(): param_val_dict.update({keys: val}) - # TODO: use the origin key to raise error if not self._checked: for keys in args: if param_map[keys] not in param_val_dict.keys(): - missing.append(keys) + missing.append(param_map[keys]) if len(duplicated) > 0 or len(missing) > 0: raise CheckError( diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 9b77d0a1..429a97e0 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -300,3 +300,13 @@ def test_NLLLoss(self): b = torch.tensor([1, 0, 4]) ans = l1({"my_predict": a}, {"my_truth": b}) self.assertEqual(ans, torch.nn.functional.nll_loss(a, b)) + + def test_check_error(self): + l1 = loss.NLLLoss(pred="my_predict", target="my_truth") + a = F.log_softmax(torch.randn(3, 5, requires_grad=False), dim=0) + b = torch.tensor([1, 0, 4]) + with self.assertRaises(Exception): + ans = l1({"wrong_predict": a, "my": b}, {"my_truth": b}) + + with self.assertRaises(Exception): + ans = l1({"my_predict": a}, {"truth": b, "my": a}) From 5edd9de84178db51c7492da86d76f3468092bde3 Mon Sep 17 00:00:00 2001 From: yunfan Date: Tue, 4 Dec 2018 15:49:01 +0800 Subject: [PATCH 50/67] fix bugs --- fastNLP/core/dataset.py | 2 +- fastNLP/models/cnn_text_classification.py | 23 ----------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index bc4dcf57..cdca4356 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -67,8 +67,8 @@ def __init__(self, dataset, idx): self.dataset = dataset self.idx = idx def __getitem__(self, item): - assert self.idx < len(self.dataset), "index:{} out of range".format(self.idx) assert item in self.dataset.field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx]) + assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] def __repr__(self): return self.dataset[self.idx].__repr__() diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 9aa07e66..c8fe5181 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -33,7 +33,6 @@ def __init__(self, embed_num, padding=padding) self.dropout = nn.Dropout(dropout) self.fc = encoder.Linear(sum(kernel_nums), num_classes) - self._loss = nn.CrossEntropyLoss() def forward(self, word_seq): """ @@ -56,25 +55,3 @@ def predict(self, word_seq): output = self(word_seq) _, predict = output['output'].max(dim=1) return {'predict': predict} - - def get_loss(self, output, label_seq): - """ - - :param output: output of forward(), [batch_size, seq_len] - :param label_seq: true label in DataSet, [batch_size, seq_len] - :return loss: torch.Tensor - """ - return self._loss(output, label_seq) - - def evaluate(self, predict, label_seq): - """ - - :param predict: iterable predict tensors - :param label_seq: iterable true label tensors - :return accuracy: dict of float - """ - predict, label_seq = torch.stack(tuple(predict), dim=0), torch.stack(tuple(label_seq), dim=0) - predict, label_seq = predict.squeeze(), label_seq.squeeze() - correct = (predict == label_seq).long().sum().item() - total = label_seq.size(0) - return {'acc': 1.0 * 
correct / total} From 27833d06ae7ab67480e1b43df05ffbc092d86244 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 16:13:20 +0800 Subject: [PATCH 51/67] FieldArray only check type when is_input or is_target is set. --- fastNLP/core/fieldarray.py | 110 +++++++++++++++++++++++------------ test/core/test_fieldarray.py | 23 ++++++++ test/core/test_metrics.py | 31 +++++----- 3 files changed, 111 insertions(+), 53 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 0a94b26c..2340cd13 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -7,11 +7,11 @@ class FieldArray(object): """ - def __init__(self, name, content, padding_val=0, is_target=False, is_input=False): + def __init__(self, name, content, padding_val=0, is_target=None, is_input=None): """ :param str name: the name of the FieldArray - :param list content: a list of int, float, str or np.ndarray, or a list of list of one. + :param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray. :param int padding_val: the integer for padding. Default: 0. :param bool is_target: If True, this FieldArray is used to compute loss. :param bool is_input: If True, this FieldArray is used to the model input. @@ -20,18 +20,44 @@ def __init__(self, name, content, padding_val=0, is_target=False, is_input=False if isinstance(content, list): content = content elif isinstance(content, np.ndarray): - content = content.tolist() + content = content.tolist() # convert np.ndarray into 2-D list else: raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) self.content = content self.padding_val = padding_val - self.is_target = is_target - self.is_input = is_input + + self._is_target = None + self._is_input = None self.BASIC_TYPES = (int, float, str, np.ndarray) self.is_2d_list = False - self.pytype = self._type_detection(content) + self.pytype = None # int, float, str, or np.ndarray + self.dtype = None # np.int64, np.float64, np.str + + if is_input is not None: + self.is_input = is_input + if is_target is not None: + self.is_target = is_target + + @property + def is_input(self): + return self._is_input + + @is_input.setter + def is_input(self, value): + self.pytype = self._type_detection(self.content) + self.dtype = self._map_to_np_type(self.pytype) + self._is_input = value + + @property + def is_target(self): + return self._is_target + + @is_target.setter + def is_target(self, value): + self.pytype = self._type_detection(self.content) self.dtype = self._map_to_np_type(self.pytype) + self._is_target = value def _type_detection(self, content): """ @@ -42,9 +68,13 @@ def _type_detection(self, content): """ if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list): # content is a 2-D list + if not all(isinstance(_, list) for _ in content): # strict check 2-D list + raise TypeError("Please provide 2-D list.") type_set = set([self._type_detection(x) for x in content]) - if len(type_set) > 1: - raise RuntimeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set)) + if len(type_set) == 2 and int in type_set and float in type_set: + type_set = {float} + elif len(type_set) > 1: + raise TypeError("Cannot create FieldArray with more than one type. 
Provided {}".format(type_set)) self.is_2d_list = True return type_set.pop() @@ -60,9 +90,9 @@ def _type_detection(self, content): # up-cast int to float return float else: - raise RuntimeError("Cannot create FieldArray with type {}".format(*type_set)) + raise TypeError("Cannot create FieldArray with type {}".format(*type_set)) else: - raise RuntimeError("Cannot create FieldArray with type {}".format(type(content))) + raise TypeError("Cannot create FieldArray with type {}".format(type(content))) @staticmethod def _map_to_np_type(basic_type): @@ -77,33 +107,38 @@ def append(self, val): :param val: int, float, str, or a list of one. """ - val_type = type(val) - if val_type == list: # shape check - if self.is_2d_list is False: - raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.") - if len(val) == 0: - raise RuntimeError("Cannot append an empty list.") - val_list_type = set([type(_) for _ in val]) # type check - if len(val_list_type) == 2 and int in val_list_type and float in val_list_type: - # up-cast int to float - val_type = float - elif len(val_list_type) == 1: - val_type = val_list_type.pop() + if self.is_target is True or self.is_input is True: + # only check type when used as target or input + + val_type = type(val) + if val_type == list: # shape check + if self.is_2d_list is False: + raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.") + if len(val) == 0: + raise RuntimeError("Cannot append an empty list.") + val_list_type = set([type(_) for _ in val]) # type check + if len(val_list_type) == 2 and int in val_list_type and float in val_list_type: + # up-cast int to float + val_type = float + elif len(val_list_type) == 1: + val_type = val_list_type.pop() + else: + raise TypeError("Cannot append a list of {}".format(val_list_type)) else: - raise RuntimeError("Cannot append a list of {}".format(val_list_type)) - else: - if self.is_2d_list is True: - raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.") - if val_type == float and self.pytype == int: - # up-cast - self.pytype = float - self.dtype = self._map_to_np_type(self.pytype) - elif val_type == int and self.pytype == float: - pass - elif val_type == self.pytype: - pass - else: - raise RuntimeError("Cannot append type {} into type {}".format(val_type, self.pytype)) + if self.is_2d_list is True: + raise RuntimeError("Cannot append a non-list into a 2-D list. 
Please provide a list.") + + if val_type == float and self.pytype == int: + # up-cast + self.pytype = float + self.dtype = self._map_to_np_type(self.pytype) + elif val_type == int and self.pytype == float: + pass + elif val_type == self.pytype: + pass + else: + raise TypeError("Cannot append type {} into type {}".format(val_type, self.pytype)) + self.content.append(val) def __getitem__(self, indices): @@ -121,7 +156,8 @@ def get(self, indices): """ if isinstance(indices, int): return self.content[indices] - assert self.is_input is True or self.is_target is True + if self.is_input is False and self.is_target is False: + raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name)) batch_size = len(indices) # TODO 当这个fieldArray是seq_length这种只有一位的内容时,不需要padding,需要再讨论一下 if not is_iterable(self.content[0]): diff --git a/test/core/test_fieldarray.py b/test/core/test_fieldarray.py index 0264c2ff..c22bac5b 100644 --- a/test/core/test_fieldarray.py +++ b/test/core/test_fieldarray.py @@ -44,11 +44,34 @@ def test_type_conversion(self): def test_support_np_array(self): fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=False) self.assertEqual(fa.dtype, np.ndarray) + self.assertEqual(fa.pytype, np.ndarray) fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5])) + self.assertEqual(fa.dtype, np.ndarray) self.assertEqual(fa.pytype, np.ndarray) + fa = FieldArray("my_field", np.random.rand(3, 5), is_input=False) + # in this case, pytype is actually a float. We do not care about it. + self.assertEqual(fa.dtype, np.float64) + def test_nested_list(self): fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.1, 2.2, 3.3, 4.4, 5.5]], is_input=False) self.assertEqual(fa.pytype, float) self.assertEqual(fa.dtype, np.float64) + + def test_getitem_v1(self): + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + self.assertEqual(fa[0], [1.1, 2.2, 3.3, 4.4, 5.5]) + ans = fa[[0, 1]] + self.assertTrue(isinstance(ans, np.ndarray)) + self.assertTrue(isinstance(ans[0], np.ndarray)) + self.assertEqual(ans[0].tolist(), [1.1, 2.2, 3.3, 4.4, 5.5]) + self.assertEqual(ans[1].tolist(), [1, 2, 3, 4, 5]) + self.assertEqual(ans.dtype, np.float64) + + def test_getitem_v2(self): + x = np.random.rand(10, 5) + fa = FieldArray("my_field", x, is_input=True) + indices = [0, 1, 3, 4, 6] + for a, b in zip(fa[indices], x[indices]): + self.assertListEqual(a.tolist(), b.tolist()) diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 1b8ae70b..76352aba 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -1,9 +1,10 @@ - import unittest -from fastNLP.core.metrics import AccuracyMetric -import torch import numpy as np +import torch + +from fastNLP.core.metrics import AccuracyMetric + class TestAccuracyMetric(unittest.TestCase): def test_AccuracyMetric1(self): @@ -12,9 +13,9 @@ def test_AccuracyMetric1(self): target_dict = {'target': torch.zeros(4)} metric = AccuracyMetric() - metric(pred_dict=pred_dict, target_dict=target_dict, ) + metric(pred_dict=pred_dict, target_dict=target_dict, ) print(metric.get_metric()) - # + def test_AccuracyMetric2(self): # (2) with corrupted size try: @@ -22,13 +23,13 @@ def test_AccuracyMetric2(self): target_dict = {'target': torch.zeros(4)} metric = AccuracyMetric() - metric(pred_dict=pred_dict, target_dict=target_dict, ) + metric(pred_dict=pred_dict, target_dict=target_dict, ) print(metric.get_metric()) except Exception as e: print(e) return self.assertTrue(True, False), "No exception catches." 
- # + def test_AccuracyMetric3(self): # (3) the second batch is corrupted size try: @@ -47,7 +48,6 @@ def test_AccuracyMetric3(self): return self.assertTrue(True, False), "No exception catches." - # def test_AccuaryMetric4(self): # (5) check reset metric = AccuracyMetric() @@ -57,9 +57,9 @@ def test_AccuaryMetric4(self): self.assertDictEqual(metric.get_metric(), {'acc': 1}) pred_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4, 3)+1} + target_dict = {'target': torch.zeros(4, 3) + 1} metric(pred_dict=pred_dict, target_dict=target_dict) - self.assertDictEqual(metric.get_metric(), {'acc':0}) + self.assertDictEqual(metric.get_metric(), {'acc': 0}) def test_AccuaryMetric5(self): # (5) check reset @@ -70,11 +70,10 @@ def test_AccuaryMetric5(self): self.assertDictEqual(metric.get_metric(reset=False), {'acc': 1}) pred_dict = {"pred": torch.zeros(4, 3, 2)} - target_dict = {'target': torch.zeros(4, 3)+1} + target_dict = {'target': torch.zeros(4, 3) + 1} metric(pred_dict=pred_dict, target_dict=target_dict) - self.assertDictEqual(metric.get_metric(), {'acc':0.5}) + self.assertDictEqual(metric.get_metric(), {'acc': 0.5}) - # def test_AccuaryMetric6(self): # (6) check numpy array is not acceptable try: @@ -99,9 +98,9 @@ def test_AccuaryMetric8(self): # (8) check map, does not match. use stop_fast_param to stop fast param map try: metric = AccuracyMetric(pred='predictions', target='targets') - pred_dict = {"prediction": torch.zeros(4, 3, 2), "stop_fast_param":1} + pred_dict = {"prediction": torch.zeros(4, 3, 2), "stop_fast_param": 1} target_dict = {'targets': torch.zeros(4, 3)} - metric(pred_dict=pred_dict, target_dict=target_dict, ) + metric(pred_dict=pred_dict, target_dict=target_dict, ) self.assertDictEqual(metric.get_metric(), {'acc': 1}) except Exception as e: print(e) @@ -112,7 +111,7 @@ def test_AccuaryMetric9(self): # (9) check map, include unused try: metric = AccuracyMetric(pred='prediction', target='targets') - pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused':1} + pred_dict = {"prediction": torch.zeros(4, 3, 2), 'unused': 1} target_dict = {'targets': torch.zeros(4, 3)} metric(pred_dict=pred_dict, target_dict=target_dict) self.assertDictEqual(metric.get_metric(), {'acc': 1}) From 62c63f159ac2212dec4d8b2cd70931af61919209 Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 4 Dec 2018 16:22:41 +0800 Subject: [PATCH 52/67] test loss --- test/core/test_loss.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 9b77d0a1..060aefb3 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -300,3 +300,22 @@ def test_NLLLoss(self): b = torch.tensor([1, 0, 4]) ans = l1({"my_predict": a}, {"my_truth": b}) self.assertEqual(ans, torch.nn.functional.nll_loss(a, b)) + +class TestLosserError(unittest.TestCase): + def test_losser1(self): + # (1) only input, targets passed + pred_dict = {"pred": torch.zeros(4, 3)} + target_dict = {'target': torch.zeros(4).long()} + los = loss.CrossEntropyLoss() + + print(los(pred_dict=pred_dict, target_dict=target_dict)) + + # + def test_AccuracyMetric2(self): + # (2) with corrupted size + pred_dict = {"pred": torch.zeros(16, 3, 4)} + target_dict = {'target': torch.zeros(16, 3).long()} + los = loss.CrossEntropyLoss() + + print(los(pred_dict=pred_dict, target_dict=target_dict)) + From 52b1b18a76d3620f413d59967f1b9cb2f4ec650e Mon Sep 17 00:00:00 2001 From: yunfan Date: Tue, 4 Dec 2018 17:04:31 +0800 Subject: [PATCH 53/67] fix bugs in vocab --- 
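A quick usage sketch of the Vocabulary API after this change, assuming the default special-token strings are "<unk>" and "<pad>" (pass unknown=None / padding=None to disable either token):

    from fastNLP.core.vocabulary import Vocabulary

    vocab = Vocabulary(min_freq=2, unknown="<unk>", padding="<pad>")
    vocab.update(["the", "quick", "brown", "fox", "the", "the", "quick"])
    idx = vocab.to_index("the")            # the word-to-index dict is built lazily on first lookup
    assert vocab.padding_idx == 0 and vocab.unknown_idx == 1
    assert vocab.to_word(idx) == "the"
    assert vocab.to_index("out-of-vocab") == vocab.unknown_idx  # OOV words fall back to the unknown token
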
fastNLP/core/vocabulary.py | 49 +++++++++++---------------------- test/core/test_trainer.py | 52 +++++++++++++++++++----------------- test/core/test_vocabulary.py | 20 +++++++------- 3 files changed, 53 insertions(+), 68 deletions(-) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index ca6b4ebf..14577635 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,11 +1,4 @@ from collections import Counter -from copy import deepcopy - -DEFAULT_PADDING_LABEL = '' # dict index = 0 -DEFAULT_UNKNOWN_LABEL = '' # dict index = 1 - -DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1} - def isiterable(p_object): try: @@ -57,22 +50,16 @@ class Vocabulary(object): vocab.to_word(5) """ - def __init__(self, need_default=True, max_size=None, min_freq=None): + def __init__(self, max_size=None, min_freq=None, unknown='', padding=''): """ - :param bool need_default: set if the Vocabulary has default labels reserved for sequences. Default: True. :param int max_size: set the max number of words in Vocabulary. Default: None :param int min_freq: set the min occur frequency of words in Vocabulary. Default: None """ self.max_size = max_size self.min_freq = min_freq self.word_count = Counter() - self.has_default = need_default - if self.has_default: - self.padding_label = DEFAULT_PADDING_LABEL - self.unknown_label = DEFAULT_UNKNOWN_LABEL - else: - self.padding_label = None - self.unknown_label = None + self.unknown = unknown + self.padding = padding self.word2idx = None self.idx2word = None self.rebuild = True @@ -113,17 +100,18 @@ def build_vocab(self): """Build 'word to index' dict, and filter the word using `max_size` and `min_freq`. """ - if self.has_default: - self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX) - self.word2idx[self.unknown_label] = self.word2idx.pop(DEFAULT_UNKNOWN_LABEL) - self.word2idx[self.padding_label] = self.word2idx.pop(DEFAULT_PADDING_LABEL) - else: - self.word2idx = {} + self.word2idx = {} + if self.padding is not None: + self.word2idx[self.padding] = 0 + if self.unknown is not None: + self.word2idx[self.unknown] = 1 max_size = min(self.max_size, len(self.word_count)) if self.max_size else None words = self.word_count.most_common(max_size) if self.min_freq is not None: words = filter(lambda kv: kv[1] >= self.min_freq, words) + if self.word2idx is not None: + words = filter(lambda kv: kv[0] not in self.word2idx, words) start_idx = len(self.word2idx) self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) self.build_reverse_vocab() @@ -159,8 +147,8 @@ def __getitem__(self, w): """ if w in self.word2idx: return self.word2idx[w] - elif self.has_default: - return self.word2idx[self.unknown_label] + if self.unknown is not None: + return self.word2idx[self.unknown] else: raise ValueError("word {} not in vocabulary".format(w)) @@ -175,21 +163,16 @@ def to_index(self, w): @property @check_build_vocab def unknown_idx(self): - if self.unknown_label is None: + if self.unknown is None: return None - return self.word2idx[self.unknown_label] - - def __setattr__(self, name, val): - self.__dict__[name] = val - if name in ["unknown_label", "padding_label"]: - self.word2idx = None + return self.word2idx[self.unknown] @property @check_build_vocab def padding_idx(self): - if self.padding_label is None: + if self.padding is None: return None - return self.word2idx[self.padding_label] + return self.word2idx[self.padding] @check_build_vocab def to_word(self, idx): diff --git a/test/core/test_trainer.py 
b/test/core/test_trainer.py index 7903b403..1b578eae 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -4,6 +4,7 @@ import torch.nn.functional as F from torch import nn +from fastNLP.core.utils import CheckError from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance from fastNLP.core.losses import BCELoss @@ -56,7 +57,8 @@ def test_case(self): dev_data=dev_set, optimizer=SGD(lr=0.1), check_code_level=2, - use_tqdm=True) + use_tqdm=True, + save_path=None) trainer.train() """ # 应该正确运行 @@ -145,16 +147,14 @@ def forward(self, x1, x2, y): return {'wrong_loss_key': loss} model = Model() - trainer = Trainer( - train_data=dataset, - model=model, - use_tqdm=False, - print_every=2 - ) - trainer.train() - """ - # 应该正确运行 - """ + with self.assertRaises(NameError): + trainer = Trainer( + train_data=dataset, + model=model, + use_tqdm=False, + print_every=2 + ) + trainer.train() def test_trainer_suggestion4(self): # 检查报错提示能否正确提醒用户 @@ -173,12 +173,13 @@ def forward(self, x1, x2, y): return {'loss': loss} model = Model() - trainer = Trainer( - train_data=dataset, - model=model, - use_tqdm=False, - print_every=2 - ) + with self.assertRaises(NameError): + trainer = Trainer( + train_data=dataset, + model=model, + use_tqdm=False, + print_every=2 + ) def test_trainer_suggestion5(self): # 检查报错提示能否正确提醒用户 @@ -225,14 +226,15 @@ def forward(self, x1, x2, y): return {'pred': x} model = Model() - trainer = Trainer( - train_data=dataset, - model=model, - dev_data=dataset, - metrics=AccuracyMetric(), - use_tqdm=False, - print_every=2 - ) + with self.assertRaises(NameError): + trainer = Trainer( + train_data=dataset, + model=model, + dev_data=dataset, + metrics=AccuracyMetric(), + use_tqdm=False, + print_every=2 + ) def test_case2(self): # check metrics Wrong diff --git a/test/core/test_vocabulary.py b/test/core/test_vocabulary.py index e453e935..af2c493b 100644 --- a/test/core/test_vocabulary.py +++ b/test/core/test_vocabulary.py @@ -10,36 +10,36 @@ class TestAdd(unittest.TestCase): def test_add(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) for word in text: vocab.add(word) self.assertEqual(vocab.word_count, counter) def test_add_word(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) for word in text: vocab.add_word(word) self.assertEqual(vocab.word_count, counter) def test_add_word_lst(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) vocab.add_word_lst(text) self.assertEqual(vocab.word_count, counter) def test_update(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) vocab.update(text) self.assertEqual(vocab.word_count, counter) class TestIndexing(unittest.TestCase): def test_len(self): - vocab = Vocabulary(need_default=False, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None, unknown=None, padding=None) vocab.update(text) self.assertEqual(len(vocab), len(counter)) def test_contains(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None, unknown=None, padding=None) vocab.update(text) self.assertTrue(text[-1] in vocab) self.assertFalse("~!@#" in vocab) @@ -47,7 +47,7 @@ def test_contains(self): self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#")) def 
test_index(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) vocab.update(text) res = [vocab[w] for w in set(text)] self.assertEqual(len(res), len(set(res))) @@ -56,14 +56,14 @@ def test_index(self): self.assertEqual(len(res), len(set(res))) def test_to_word(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) vocab.update(text) self.assertEqual(text, [vocab.to_word(idx) for idx in [vocab[w] for w in text]]) class TestOther(unittest.TestCase): def test_additional_update(self): - vocab = Vocabulary(need_default=True, max_size=None, min_freq=None) + vocab = Vocabulary(max_size=None, min_freq=None) vocab.update(text) _ = vocab["well"] @@ -77,7 +77,7 @@ def test_additional_update(self): self.assertTrue("hahaha" in vocab) def test_warning(self): - vocab = Vocabulary(need_default=True, max_size=len(set(text)), min_freq=None) + vocab = Vocabulary(max_size=len(set(text)), min_freq=None) vocab.update(text) self.assertEqual(vocab.rebuild, True) print(len(vocab)) From 87e5d44b018cfd54b57f545159d5211e7a9e609c Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 4 Dec 2018 22:44:54 +0800 Subject: [PATCH 54/67] =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 2 ++ test/core/test_dataset.py | 7 +++++++ test/core/test_loss.py | 12 ++++++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index c1e8de0e..58847c31 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -169,6 +169,8 @@ def __init__(self, func, key_map=None, **kwargs): class CrossEntropyLoss(LossBase): def __init__(self, pred=None, target=None): + # TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际却需要 + # TODO (16, 4) super(CrossEntropyLoss, self).__init__() self.get_loss = F.cross_entropy self._init_param_map(input=pred, target=target) diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 8ca2ed86..697bcd78 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -125,6 +125,13 @@ def test_get_target_name(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target]) + def test_apply2(self): + def split_sent(ins): + return ins['raw_sentence'].split() + dataset = DataSet.read_csv('../../sentence.csv', headers=('raw_sentence', 'label'), sep='\t') + dataset.apply(split_sent, new_field_name='words') + # print(dataset) + class TestDataSetIter(unittest.TestCase): def test__repr__(self): diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 53b889c6..270b4d3b 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -311,9 +311,17 @@ def test_losser1(self): print(los(pred_dict=pred_dict, target_dict=target_dict)) # - def test_AccuracyMetric2(self): + def test_losser2(self): # (2) with corrupted size - pred_dict = {"pred": torch.zeros(16, 3, 4)} + pred_dict = {"pred": torch.zeros(16, 3)} + target_dict = {'target': torch.zeros(16, 3).long()} + los = loss.CrossEntropyLoss() + + print(los(pred_dict=pred_dict, target_dict=target_dict)) + + def test_losser3(self): + # (2) with corrupted size + pred_dict = {"pred": torch.zeros(16, 3), 'stop_fast_param':0} target_dict = {'target': torch.zeros(16, 3).long()} los = loss.CrossEntropyLoss() From 
f26f11608baa202ab18ee627e75e4229a62b6d06 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 22:57:26 +0800 Subject: [PATCH 55/67] =?UTF-8?q?*=20=E6=9B=B4=E6=96=B0=E6=95=99=E7=A8=8B?= =?UTF-8?q?=EF=BC=8C=E6=94=BE=E5=9C=A8=E5=9C=A8./tutorial=20*=20remove=20u?= =?UTF-8?q?nused=20codes=20in=20metrics.py=20*=20add=20tests=20for=20DataS?= =?UTF-8?q?et=20*=20add=20tests=20for=20FieldArray=20*=20add=20tests=20for?= =?UTF-8?q?=20metrics.py=20*=20fix=20predictor,=20add=20tests=20for=20pred?= =?UTF-8?q?ictor=20*=20fix=20bucket=20sampler,=20add=20tests=20for=20bucke?= =?UTF-8?q?t=20sampler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/__init__.py | 1 - fastNLP/core/dataset.py | 5 +- fastNLP/core/metrics.py | 116 +-- fastNLP/core/predictor.py | 4 +- fastNLP/core/sampler.py | 2 +- fastNLP/core/vocabulary.py | 7 - test/core/test_dataset.py | 23 + test/core/test_fieldarray.py | 22 + test/core/test_metrics.py | 13 + test/core/test_predictor.py | 30 +- test/core/test_sampler.py | 12 +- tutorials/fastnlp_tutorial_1204.ipynb | 1209 +++++++++++++++++++++++++ 12 files changed, 1316 insertions(+), 128 deletions(-) create mode 100644 tutorials/fastnlp_tutorial_1204.ipynb diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index dfe35f77..b16fe165 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -3,7 +3,6 @@ from .fieldarray import FieldArray from .instance import Instance from .losses import Loss -from .metrics import Evaluator, ClassifyEvaluator, SNLIEvaluator, SeqLabelEvaluator from .optimizer import Optimizer from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler from .tester import Tester diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index cdca4356..3dbea8eb 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -1,4 +1,5 @@ import _pickle as pickle + import numpy as np from fastNLP.core.fieldarray import FieldArray @@ -66,10 +67,12 @@ class Iter_ptr: def __init__(self, dataset, idx): self.dataset = dataset self.idx = idx + def __getitem__(self, item): assert item in self.dataset.field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] + def __repr__(self): return self.dataset[self.idx].__repr__() @@ -339,6 +342,6 @@ def save(self, path): pickle.dump(self, f) @staticmethod - def load(self, path): + def load(path): with open(path, 'rb') as f: return pickle.load(f) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index c17d408b..5d808f6a 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -304,118 +304,6 @@ def _prepare_metrics(metrics): return _metrics -class Evaluator(object): - def __init__(self): - pass - - def __call__(self, predict, truth): - """ - - :param predict: list of tensors, the network outputs from all batches. - :param truth: list of dict, the ground truths from all batch_y. 
- :return: - """ - raise NotImplementedError - - -class ClassifyEvaluator(Evaluator): - def __init__(self): - super(ClassifyEvaluator, self).__init__() - - def __call__(self, predict, truth): - y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict] - y_prob = torch.cat(y_prob, dim=0) - y_pred = torch.argmax(y_prob, dim=-1) - y_true = torch.cat(truth, dim=0) - acc = float(torch.sum(y_pred == y_true)) / len(y_true) - return {"accuracy": acc} - - -class SeqLabelEvaluator(Evaluator): - def __init__(self): - super(SeqLabelEvaluator, self).__init__() - - def __call__(self, predict, truth, **_): - """ - - :param predict: list of List, the network outputs from all batches. - :param truth: list of dict, the ground truths from all batch_y. - :return accuracy: - """ - total_correct, total_count = 0., 0. - for x, y in zip(predict, truth): - x = torch.tensor(x) - y = y.to(x) # make sure they are in the same device - mask = (y > 0) - correct = torch.sum(((x == y) * mask).long()) - total_correct += float(correct) - total_count += float(torch.sum(mask.long())) - accuracy = total_correct / total_count - return {"accuracy": float(accuracy)} - - -class SeqLabelEvaluator2(Evaluator): - # 上面的evaluator应该是错误的 - def __init__(self, seq_lens_field_name='word_seq_origin_len'): - super(SeqLabelEvaluator2, self).__init__() - self.end_tagidx_set = set() - self.seq_lens_field_name = seq_lens_field_name - - def __call__(self, predict, truth, **_): - """ - - :param predict: list of batch, the network outputs from all batches. - :param truth: list of dict, the ground truths from all batch_y. - :return accuracy: - """ - seq_lens = _[self.seq_lens_field_name] - corr_count = 0 - pred_count = 0 - truth_count = 0 - for x, y, seq_len in zip(predict, truth, seq_lens): - x = x.cpu().numpy() - y = y.cpu().numpy() - for idx, s_l in enumerate(seq_len): - x_ = x[idx] - y_ = y[idx] - x_ = x_[:s_l] - y_ = y_[:s_l] - flag = True - start = 0 - for idx_i, (x_i, y_i) in enumerate(zip(x_, y_)): - if x_i in self.end_tagidx_set: - truth_count += 1 - for j in range(start, idx_i + 1): - if y_[j] != x_[j]: - flag = False - break - if flag: - corr_count += 1 - flag = True - start = idx_i + 1 - if y_i in self.end_tagidx_set: - pred_count += 1 - P = corr_count / (float(pred_count) + 1e-6) - R = corr_count / (float(truth_count) + 1e-6) - F = 2 * P * R / (P + R + 1e-6) - - return {"P": P, 'R': R, 'F': F} - - -class SNLIEvaluator(Evaluator): - def __init__(self): - super(SNLIEvaluator, self).__init__() - - def __call__(self, predict, truth): - y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict] - y_prob = torch.cat(y_prob, dim=0) - y_pred = torch.argmax(y_prob, dim=-1) - truth = [t['truth'] for t in truth] - y_true = torch.cat(truth, dim=0).view(-1) - acc = float(torch.sum(y_pred == y_true)) / y_true.size(0) - return {"accuracy": acc} - - def _conver_numpy(x): """convert input data to numpy array @@ -467,11 +355,11 @@ def _check_data(y_true, y_pred): type_true, y_true = _label_types(y_true) type_pred, y_pred = _label_types(y_pred) - type_set = set(['binary', 'multiclass']) + type_set = {'binary', 'multiclass'} if type_true in type_set and type_pred in type_set: return type_true if type_true == type_pred else 'multiclass', y_true, y_pred - type_set = set(['multiclass-multioutput', 'multilabel']) + type_set = {'multiclass-multioutput', 'multilabel'} if type_true in type_set and type_pred in type_set: return type_true if type_true == type_pred else 'multiclass-multioutput', y_true, y_pred diff --git 
a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 7cde4844..9ce1d792 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -23,13 +23,13 @@ def predict(self, network, data): :param network: a PyTorch model (cpu) :param data: a DataSet object. - :return: list of list of strings, [num_examples, tag_seq_length] + :return: list of batch outputs """ # turn on the testing mode; clean up the history self.mode(network, test=True) batch_output = [] - data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False) + data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) for batch_x, _ in data_iterator: with torch.no_grad(): diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index f5e83c6b..d568acf3 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -55,7 +55,7 @@ def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_lens' def __call__(self, data_set): - seq_lens = data_set[self.seq_lens_field_name].content + seq_lens = data_set.get_fields()[self.seq_lens_field_name].content total_sample_num = len(seq_lens) bucket_indexes = [] diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 14577635..e8cc0e22 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,12 +1,5 @@ from collections import Counter -def isiterable(p_object): - try: - _ = iter(p_object) - except TypeError: - return False - return True - def check_build_vocab(func): """A decorator to make sure the indexing is built before used. diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 8ca2ed86..a4deb304 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -1,3 +1,4 @@ +import os import unittest from fastNLP.core.dataset import DataSet @@ -90,6 +91,18 @@ def test_apply(self): self.assertTrue("rx" in ds.field_arrays) self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1]) + ds.apply(lambda ins: len(ins["y"]), new_field_name="y") + self.assertEqual(ds.field_arrays["y"].content[0], 2) + + res = ds.apply(lambda ins: len(ins["x"])) + self.assertTrue(isinstance(res, list) and len(res) > 0) + self.assertTrue(res[0], 4) + + def test_drop(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20}) + ds.drop(lambda ins: len(ins["y"]) < 3) + self.assertEqual(len(ds), 20) + def test_contains(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) self.assertTrue("x" in ds) @@ -125,9 +138,19 @@ def test_get_target_name(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target]) + def test_save_load(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ds.save("./my_ds.pkl") + self.assertTrue(os.path.exists("./my_ds.pkl")) + + ds_1 = DataSet.load("./my_ds.pkl") + os.remove("my_ds.pkl") + # 能跑通就行 + class TestDataSetIter(unittest.TestCase): def test__repr__(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) for iter in ds: self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}") + diff --git a/test/core/test_fieldarray.py b/test/core/test_fieldarray.py index c22bac5b..c0b8a592 100644 --- a/test/core/test_fieldarray.py +++ b/test/core/test_fieldarray.py @@ -75,3 +75,25 @@ def test_getitem_v2(self): indices = [0, 1, 3, 4, 6] for a, b in zip(fa[indices], x[indices]): self.assertListEqual(a.tolist(), b.tolist()) + + 
def test_append(self): + with self.assertRaises(Exception): + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa.append(0) + + with self.assertRaises(Exception): + fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True) + fa.append([1, 2, 3, 4, 5]) + + with self.assertRaises(Exception): + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa.append([]) + + with self.assertRaises(Exception): + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa.append(["str", 0, 0, 0, 1.89]) + + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) + self.assertEqual(len(fa), 3) + self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 76352aba..9286a26f 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -4,6 +4,7 @@ import torch from fastNLP.core.metrics import AccuracyMetric +from fastNLP.core.metrics import accuracy_score, recall_score, precision_score, f1_score class TestAccuracyMetric(unittest.TestCase): @@ -132,3 +133,15 @@ def test_AccuaryMetric10(self): print(e) return self.assertTrue(True, False), "No exception catches." + + +class TestUsefulFunctions(unittest.TestCase): + # 测试metrics.py中一些看上去挺有用的函数 + def test_case_1(self): + # multi-class + _ = accuracy_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1))) + _ = precision_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) + _ = recall_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) + _ = f1_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) + + # 跑通即可 diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py index 7b4f5da9..8be5f289 100644 --- a/test/core/test_predictor.py +++ b/test/core/test_predictor.py @@ -1,6 +1,34 @@ import unittest +import numpy as np +import torch + +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance +from fastNLP.core.predictor import Predictor +from fastNLP.modules.encoder.linear import Linear + + +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set + class TestPredictor(unittest.TestCase): def test(self): - pass + predictor = Predictor() + model = Linear(2, 1) + data = prepare_fake_dataset() + data.set_input("x") + ans = predictor.predict(model, data) + self.assertEqual(len(ans), 2000) + self.assertTrue(isinstance(ans[0], torch.Tensor)) diff --git a/test/core/test_sampler.py b/test/core/test_sampler.py index 5da0e6db..b23af470 100644 --- a/test/core/test_sampler.py +++ b/test/core/test_sampler.py @@ -1,9 +1,11 @@ +import random import unittest import torch +from fastNLP.core.dataset import DataSet from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \ - k_means_1d, k_means_bucketing, simple_sort_bucketing + k_means_1d, k_means_bucketing, simple_sort_bucketing, BucketSampler 
class TestSampler(unittest.TestCase): @@ -40,3 +42,11 @@ def test_k_means_bucketing(self): def test_simple_sort_bucketing(self): _ = simple_sort_bucketing([21, 3, 25, 7, 9, 22, 4, 6, 28, 10]) assert len(_) == 10 + + def test_BucketSampler(self): + sampler = BucketSampler(num_buckets=3, batch_size=16, seq_lens_field_name="seq_len") + data_set = DataSet({"x": [[0] * random.randint(1, 10)] * 10, "y": [[5, 6]] * 10}) + data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len") + indices = sampler(data_set) + self.assertEqual(len(indices), 10) + # 跑通即可,不验证效果 diff --git a/tutorials/fastnlp_tutorial_1204.ipynb b/tutorials/fastnlp_tutorial_1204.ipynb new file mode 100644 index 00000000..1a002750 --- /dev/null +++ b/tutorials/fastnlp_tutorial_1204.ipynb @@ -0,0 +1,1209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "fastNLP上手教程\n", + "-------\n", + "\n", + "fastNLP提供方便的数据预处理,训练和测试模型的功能" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('C:/Users/zyfeng/Desktop/FudanNLP/fastNLP')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataSet & Instance\n", + "------\n", + "\n", + "fastNLP用DataSet和Instance保存和处理数据。每个DataSet表示一个数据集,每个Instance表示一个数据样本。一个DataSet存有多个Instance,每个Instance可以自定义存哪些内容。\n", + "\n", + "有一些read_*方法,可以轻松从文件读取数据,存成DataSet。" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "38" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import DataSet\n", + "from fastNLP import Instance\n", + "\n", + "# 从csv读取数据到DataSet\n", + "dataset = DataSet.read_csv('./test/data_for_tests/tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), sep='\\t')\n", + "print(len(dataset))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 使用数字索引[k],获取第k个样本\n", + "print(dataset[0])\n", + "\n", + "# 索引也可以是负数\n", + "print(dataset[-3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instance\n", + "Instance表示一个样本,由一个或多个field(域,属性,特征)组成,每个field有名字和值。\n", + "\n", + "在初始化Instance时即可定义它包含的域,使用 \"field_name=field_value\"的写法。" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'raw_sentence': fake data,\n'label': 0}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataSet.append(Instance)加入新数据\n", + "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", + "dataset[-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataSet.apply方法\n", + "数据预处理利器" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , 
some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + } + ], + "source": [ + "# 将所有数字转为小写\n", + "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# label转int\n", + "dataset.apply(lambda x: int(x['label']), new_field_name='label')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1,\n'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 使用空格分割句子\n", + "def split_sent(ins):\n", + " return ins['raw_sentence'].split()\n", + "dataset.apply(split_sent, new_field_name='words')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1,\n'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'],\n'seq_len': 37}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 增加长度信息\n", + "dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataSet.drop\n", + "筛选数据" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "38" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "dataset.drop(lambda x: x['seq_len'] <= 3)\n", + "print(len(dataset))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 配置DataSet\n", + "1. 哪些域是特征,哪些域是标签\n", + "2. 
切分训练集/验证集" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# 设置DataSet中,哪些field要转为tensor\n", + "\n", + "# set target,loss或evaluate中的golden,计算loss,模型评估时使用\n", + "dataset.set_target(\"label\")\n", + "# set input,模型forward时使用\n", + "dataset.set_input(\"words\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "27" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11" + ] + } + ], + "source": [ + "# 分出测试集、训练集\n", + "\n", + "test_data, train_data = dataset.split(0.3)\n", + "print(len(test_data))\n", + "print(len(train_data))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary\n", + "------\n", + "\n", + "fastNLP中的Vocabulary轻松构建词表,将词转成数字" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': that the chuck norris `` grenade gag '' occurs about 7 times during windtalkers is a good indication of how serious-minded the film is .,\n'label': 2,\n'words': [6, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 8, 24, 1, 5, 1, 1, 2, 15, 10, 3],\n'seq_len': 25}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import Vocabulary\n", + "\n", + "# 构建词表, Vocabulary.add(word)\n", + "vocab = Vocabulary(min_freq=2)\n", + "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", + "vocab.build_vocab()\n", + "\n", + "# index句子, Vocabulary.to_index(word)\n", + "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')\n", + "test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')\n", + "\n", + "\n", + "print(test_data[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model\n", + "定义一个PyTorch模型" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CNNText(\n (embed): Embedding(\n (embed): Embedding(32, 50, padding_idx=0)\n (dropout): Dropout(p=0.0)\n )\n (conv_pool): ConvMaxpool(\n (convs): ModuleList(\n (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n )\n )\n (dropout): Dropout(p=0.1)\n (fc): Linear(\n (linear): Linear(in_features=12, out_features=5, bias=True)\n )\n)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP.models import CNNText\n", + "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这是上述模型的forward方法。如果你不知道什么是forward方法,请参考我们的PyTorch教程。\n", + "\n", + "注意两点:\n", + "1. forward参数名字叫**word_seq**,请记住。\n", + "2. 
forward的返回值是一个**dict**,其中有个key的名字叫**output**。\n", + "\n", + "```Python\n", + " def forward(self, word_seq):\n", + " \"\"\"\n", + "\n", + " :param word_seq: torch.LongTensor, [batch_size, seq_len]\n", + " :return output: dict of torch.LongTensor, [batch_size, num_classes]\n", + " \"\"\"\n", + " x = self.embed(word_seq) # [N,L] -> [N,L,C]\n", + " x = self.conv_pool(x) # [N,L,C] -> [N,C]\n", + " x = self.dropout(x)\n", + " x = self.fc(x) # [N,C] -> [N, N_class]\n", + " return {'output': x}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这是上述模型的predict方法,是用来直接输出该任务的预测结果,与forward目的不同。\n", + "\n", + "注意两点:\n", + "1. predict参数名也叫**word_seq**。\n", + "2. predict的返回值是也一个**dict**,其中有个key的名字叫**predict**。\n", + "\n", + "```\n", + " def predict(self, word_seq):\n", + " \"\"\"\n", + "\n", + " :param word_seq: torch.LongTensor, [batch_size, seq_len]\n", + " :return predict: dict of torch.LongTensor, [batch_size, seq_len]\n", + " \"\"\"\n", + " output = self(word_seq)\n", + " _, predict = output['output'].max(dim=1)\n", + " return {'predict': predict}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trainer & Tester\n", + "------\n", + "\n", + "使用fastNLP的Trainer训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import Trainer\n", + "from copy import deepcopy\n", + "from fastNLP.core.losses import CrossEntropyLoss\n", + "from fastNLP.core.metrics import AccuracyMetric\n", + "\n", + "\n", + "# 更改DataSet中对应field的名称,与模型的forward的参数名一致\n", + "# 因为forward的参数叫word_seq, 所以要把原本叫words的field改名为word_seq\n", + "# 这里的演示是让你了解这种**命名规则**\n", + "train_data.rename_field('words', 'word_seq')\n", + "test_data.rename_field('words', 'word_seq')\n", + "\n", + "# 顺便把label换名为label_seq\n", + "train_data.rename_field('label', 'label_seq')\n", + "test_data.rename_field('label', 'label_seq')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### loss\n", + "训练模型需要提供一个损失函数\n", + "\n", + "下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n", + "\n", + "pred参数对应的是模型的forward返回的dict的一个key的名字,这里是\"output\"。\n", + "\n", + "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "loss = CrossEntropyLoss(pred=\"output\", target=\"label_seq\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Metric\n", + "定义评价指标\n", + "\n", + "这里使用准确率。参数的“命名规则”跟上面类似。\n", + "\n", + "pred参数对应的是模型的predict方法返回的dict的一个key的名字,这里是\"predict\"。\n", + "\n", + "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "metric = AccuracyMetric(pred=\"predict\", target=\"label_seq\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-04 22:51:24" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\rEpoch 1/5: 0%| | 0/5 [00:00 Date: Tue, 4 Dec 2018 23:18:37 +0800 Subject: [PATCH 56/67] =?UTF-8?q?=E4=BF=AE=E6=94=B9losses=E4=B8=AD?= =?UTF-8?q?=E7=9B=B4=E6=8E=A5=E4=BD=BF=E7=94=A8F.cross=5Fentropy=E7=9A=84?= =?UTF-8?q?=E6=83=85=E5=86=B5=EF=BC=8C=E5=9B=A0=E4=B8=BA=E8=BF=99=E4=BA=9B?= 
=?UTF-8?q?=E5=87=BD=E6=95=B0=E7=9A=84signature=E6=98=AF(input,=20target)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 139 +++++++++++++++++++------------------- test/core/test_loss.py | 2 +- test/core/test_trainer.py | 8 +-- 3 files changed, 76 insertions(+), 73 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 58847c31..3bbbf9e2 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -8,8 +8,7 @@ from fastNLP.core.utils import CheckRes from fastNLP.core.utils import _build_args from fastNLP.core.utils import _check_function_or_method -from fastNLP.core.utils import _get_arg_list -from fastNLP.core.utils import _map_args +from fastNLP.core.utils import _check_arg_dict_list from fastNLP.core.utils import get_func_signature @@ -62,8 +61,7 @@ def _init_param_map(self, key_map=None, **kwargs): if func_param not in func_args: raise NameError( f"Parameter `{func_param}` is not in {get_func_signature(self.get_loss)}. Please check the " - f"initialization parameters, or change the signature of" - f" {get_func_signature(self.get_loss)}.") + f"initialization parameters, or change its signature.") # evaluate should not have varargs. if func_spect.varargs: @@ -87,71 +85,68 @@ def __call__(self, pred_dict, target_dict, check=False): loss = self.get_loss(*fast_param) return loss - args, defaults, defaults_val, varargs, kwargs = _get_arg_list(self.get_loss) - if varargs is not None: - raise RuntimeError( - f"The function {get_func_signature(self.get_loss)} should not use Positional Argument." - ) - - param_map = self.param_map - if args is None: - raise RuntimeError( - f"There is not any param in function{get_func_signature(self.get_loss)}" - ) - - self._checked = self._checked and not check if not self._checked: - for keys in args: - if keys not in param_map: - param_map.update({keys: keys}) - if defaults is not None: - for keys in defaults: - if keys not in param_map: - param_map.update({keys: keys}) - self.param_map = param_map - # param map: key= name in get_loss function, value= name in param dict - reversed_param_map = {val: key for key, val in param_map.items()} - # reversed param map: key= name in param dict, value= name in get_loss function - + # 1. check consistence between signature and param_map + func_spect = inspect.getfullargspec(self.get_loss) + func_args = set([arg for arg in func_spect.args if arg != 'self']) + for func_arg, input_arg in self.param_map.items(): + if func_arg not in func_args: + raise NameError(f"`{func_arg}` not in {get_func_signature(self.get_loss)}.") + + # 2. only part of the param_map are passed, left are not + for arg in func_args: + if arg not in self.param_map: + self.param_map[arg] = arg # This param does not need mapping. + self._evaluate_args = func_args + self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()} + + # need to wrap inputs in dict. 
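+        # The loop below routes every incoming key through self._reverse_param_map so that
+        # user-facing field names (e.g. "output" when the loss was built with pred="output")
+        # are stored under the corresponding argument names of get_loss; a mapped key that
+        # appears in both pred_dict and target_dict reaches not_duplicate_flag == 3 and is
+        # recorded in `duplicated`.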
+ mapped_pred_dict = {} + mapped_target_dict = {} duplicated = [] - missing = [] - if not self._checked: - for keys, val in pred_dict.items(): - if keys in target_dict.keys(): - duplicated.append(param_map[keys]) - - param_val_dict = {} - for keys, val in pred_dict.items(): - param_val_dict.update({keys: val}) - for keys, val in target_dict.items(): - param_val_dict.update({keys: val}) - + for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())): + not_duplicate_flag = 0 + if input_arg in self._reverse_param_map: + mapped_arg = self._reverse_param_map[input_arg] + not_duplicate_flag += 1 + else: + mapped_arg = input_arg + if input_arg in pred_dict: + mapped_pred_dict[mapped_arg] = pred_dict[input_arg] + not_duplicate_flag += 1 + if input_arg in target_dict: + mapped_target_dict[mapped_arg] = target_dict[input_arg] + not_duplicate_flag += 1 + if not_duplicate_flag == 3: + duplicated.append(input_arg) + + # missing if not self._checked: - for keys in args: - if param_map[keys] not in param_val_dict.keys(): - missing.append(param_map[keys]) - - if len(duplicated) > 0 or len(missing) > 0: - raise CheckError( - CheckRes(missing=missing, unused=[], duplicated=duplicated, required=[], all_needed=[], - varargs=varargs), - func_signature=get_func_signature(self.get_loss) - ) - + check_res = _check_arg_dict_list(self.get_loss, [mapped_pred_dict, mapped_target_dict]) + # only check missing. + missing = check_res.missing + replaced_missing = list(missing) + for idx, func_arg in enumerate(missing): + replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ + f"in `{self.__class__.__name__}`)" + + check_res = CheckRes(missing=replaced_missing, + unused=check_res.unused, + duplicated=duplicated, + required=check_res.required, + all_needed=check_res.all_needed, + varargs=check_res.varargs) + + if check_res.missing or check_res.duplicated or check_res.varargs: + raise CheckError(check_res=check_res, + func_signature=get_func_signature(self.get_loss)) + refined_args = _build_args(self.get_loss, **mapped_pred_dict, **mapped_target_dict) + + loss = self.get_loss(**refined_args) self._checked = True - param_map_val = _map_args(reversed_param_map, **param_val_dict) - param_value = _build_args(self.get_loss, **param_map_val) - loss = self.get_loss(**param_value) - - if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): - if not isinstance(loss, torch.Tensor): - raise RuntimeError(f"loss ERROR: loss except a torch.Tensor but get {type(loss)}") - raise RuntimeError(f"loss ERROR: the size of loss except torch.Size([]) but got {loss.size()}") - return loss - class LossFunc(LossBase): def __init__(self, func, key_map=None, **kwargs): super(LossFunc, self).__init__() @@ -168,34 +163,42 @@ def __init__(self, func, key_map=None, **kwargs): class CrossEntropyLoss(LossBase): - def __init__(self, pred=None, target=None): + def __init__(self, pred=None, target=None, padding_idx=-100): # TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际却需要 # TODO (16, 4) super(CrossEntropyLoss, self).__init__() - self.get_loss = F.cross_entropy - self._init_param_map(input=pred, target=target) + self._init_param_map(pred=pred, target=target) + self.padding_idx = padding_idx + def get_loss(self, pred, target): + return F.cross_entropy(input=pred, target=target, + ignore_index=self.padding_idx) class L1Loss(LossBase): def __init__(self, pred=None, target=None): super(L1Loss, self).__init__() - self.get_loss = F.l1_loss self._init_param_map(input=pred, 
target=target) + def get_loss(self, pred, target): + return F.l1_loss(input=pred, target=target) + class BCELoss(LossBase): def __init__(self, pred=None, target=None): super(BCELoss, self).__init__() - self.get_loss = F.binary_cross_entropy self._init_param_map(input=pred, target=target) + def get_loss(self, pred, target): + return F.binary_cross_entropy(input=pred, target=target) class NLLLoss(LossBase): def __init__(self, pred=None, target=None): super(NLLLoss, self).__init__() - self.get_loss = F.nll_loss self._init_param_map(input=pred, target=target) + def get_loss(self, pred, target): + return F.nll_loss(input=pred, target=target) + class LossInForward(LossBase): def __init__(self, loss_key='loss'): diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 270b4d3b..22f11234 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -322,7 +322,7 @@ def test_losser2(self): def test_losser3(self): # (2) with corrupted size pred_dict = {"pred": torch.zeros(16, 3), 'stop_fast_param':0} - target_dict = {'target': torch.zeros(16, 3).long()} + target_dict = {'target': torch.zeros(16).long()} los = loss.CrossEntropyLoss() print(los(pred_dict=pred_dict, target_dict=target_dict)) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 1b578eae..e74ec4b5 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -8,7 +8,7 @@ from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance from fastNLP.core.losses import BCELoss -from fastNLP.core.losses import LossInForward +from fastNLP.core.losses import CrossEntropyLoss from fastNLP.core.metrics import AccuracyMetric from fastNLP.core.optimizer import SGD from fastNLP.core.trainer import Trainer @@ -222,7 +222,7 @@ def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) x = x1 + x2 - loss = F.cross_entropy(x, y) + # loss = F.cross_entropy(x, y) return {'pred': x} model = Model() @@ -231,10 +231,10 @@ def forward(self, x1, x2, y): train_data=dataset, model=model, dev_data=dataset, + losser=CrossEntropyLoss(), metrics=AccuracyMetric(), use_tqdm=False, - print_every=2 - ) + print_every=2) def test_case2(self): # check metrics Wrong From 5855adbc03d108404d445e8c941efd3448bd30ba Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 23:30:54 +0800 Subject: [PATCH 57/67] fix FieldArray bug: do type check only when is_target or is_input is True --- fastNLP/core/fieldarray.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 2340cd13..e1d7a032 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -45,8 +45,9 @@ def is_input(self): @is_input.setter def is_input(self, value): - self.pytype = self._type_detection(self.content) - self.dtype = self._map_to_np_type(self.pytype) + if value is True: + self.pytype = self._type_detection(self.content) + self.dtype = self._map_to_np_type(self.pytype) self._is_input = value @property @@ -55,8 +56,9 @@ def is_target(self): @is_target.setter def is_target(self, value): - self.pytype = self._type_detection(self.content) - self.dtype = self._map_to_np_type(self.pytype) + if value is True: + self.pytype = self._type_detection(self.content) + self.dtype = self._map_to_np_type(self.pytype) self._is_target = value def _type_detection(self, content): From 1158556236c438ebbae65ca7b373116da647483e Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 5 Dec 2018 20:15:59 +0800 Subject: [PATCH 58/67] 
=?UTF-8?q?1.=20=E4=BC=98=E5=8C=96trainer=20checkcod?= =?UTF-8?q?e=E8=BF=87=E7=A8=8B=E7=9A=84=E6=8A=A5=E9=94=99=E4=BF=A1?= =?UTF-8?q?=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 2 +- fastNLP/core/fieldarray.py | 7 +- fastNLP/core/losses.py | 11 +- fastNLP/core/metrics.py | 2 + fastNLP/core/trainer.py | 24 +- fastNLP/core/utils.py | 98 +++--- requirements.txt | 2 +- test/core/test_dataset.py | 9 +- test/core/test_tester.py | 60 +++- test/core/test_trainer.py | 9 +- tutorials/fastnlp_tutorial_1204.ipynb | 415 +++----------------------- 11 files changed, 186 insertions(+), 453 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 3dbea8eb..57171e25 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -69,7 +69,7 @@ def __init__(self, dataset, idx): self.idx = idx def __getitem__(self, item): - assert item in self.dataset.field_arrays, "no such field:{} in instance {}".format(item, self.dataset[self.idx]) + assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index e1d7a032..5167be35 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -83,7 +83,8 @@ def _type_detection(self, content): elif isinstance(content, list): # content is a 1-D list if len(content) == 0: - raise RuntimeError("Cannot create FieldArray with an empty list.") + # the old error is not informative enough. + raise RuntimeError("Cannot create FieldArray with an empty list. Or one element in the list is empty.") type_set = set([type(item) for item in content]) if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES: @@ -164,11 +165,13 @@ def get(self, indices): # TODO 当这个fieldArray是seq_length这种只有一位的内容时,不需要padding,需要再讨论一下 if not is_iterable(self.content[0]): array = np.array([self.content[i] for i in indices], dtype=self.dtype) - else: + elif self.dtype in (np.int64, np.float64): max_len = max([len(self.content[i]) for i in indices]) array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype) for i, idx in enumerate(indices): array[i][:len(self.content[idx])] = self.content[idx] + else: # should only be str + array = np.array([self.content[i] for i in indices]) return array def __len__(self): diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 2a9e89cd..a4976540 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -80,7 +80,7 @@ def _fast_param_map(self, pred_dict, target_dict): fast_param = {} if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: fast_param['pred'] = list(pred_dict.values())[0] - fast_param['target'] = list(pred_dict.values())[0] + fast_param['target'] = list(target_dict.values())[0] return fast_param return fast_param @@ -134,10 +134,11 @@ def __call__(self, pred_dict, target_dict, check=False): # missing if not self._checked: check_res = _check_arg_dict_list(self.get_loss, [mapped_pred_dict, mapped_target_dict]) - # only check missing. + # replace missing. 
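+            # Each missing `evaluate` argument is rewritten into the user-provided field name via
+            # self.param_map, so the resulting CheckError names the DataSet field to supply, e.g.
+            # "label_seq(assign to `target` in `AccuracyMetric`)".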
missing = check_res.missing replaced_missing = list(missing) for idx, func_arg in enumerate(missing): + # Don't delete `` in this information, nor add `` replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ f"in `{self.__class__.__name__}`)" @@ -188,7 +189,7 @@ def get_loss(self, pred, target): class L1Loss(LossBase): def __init__(self, pred=None, target=None): super(L1Loss, self).__init__() - self._init_param_map(input=pred, target=target) + self._init_param_map(pred=pred, target=target) def get_loss(self, pred, target): return F.l1_loss(input=pred, target=target) @@ -197,7 +198,7 @@ def get_loss(self, pred, target): class BCELoss(LossBase): def __init__(self, pred=None, target=None): super(BCELoss, self).__init__() - self._init_param_map(input=pred, target=target) + self._init_param_map(pred=pred, target=target) def get_loss(self, pred, target): return F.binary_cross_entropy(input=pred, target=target) @@ -205,7 +206,7 @@ def get_loss(self, pred, target): class NLLLoss(LossBase): def __init__(self, pred=None, target=None): super(NLLLoss, self).__init__() - self._init_param_map(input=pred, target=target) + self._init_param_map(pred=pred, target=target) def get_loss(self, pred, target): return F.nll_loss(input=pred, target=target) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f8279d0a..d97ba699 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -151,9 +151,11 @@ def __call__(self, pred_dict, target_dict): if not self._checked: check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict]) # only check missing. + # replace missing. missing = check_res.missing replaced_missing = list(missing) for idx, func_arg in enumerate(missing): + # Don't delete `` in this information, nor add `` replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ f"in `{self.__class__.__name__}`)" diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 13a3490a..8f676279 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -2,7 +2,7 @@ import time from datetime import datetime from datetime import timedelta -from tqdm import tqdm +from tqdm.autonotebook import tqdm import torch from tensorboardX import SummaryWriter @@ -23,7 +23,6 @@ from fastNLP.core.utils import _check_loss_evaluate from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature -from fastNLP.core.utils import _relocate_pbar class Trainer(object): """Main Training Loop @@ -45,7 +44,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :param int validate_every: step interval to do next validation. Default: -1(validate every epoch). :param DataSet dev_data: the validation data :param use_cuda: - :param str save_path: file path to save models + :param save_path: file path to save models :param Optimizer optimizer: an optimizer object :param int check_code_level: level of FastNLP code checker. -1: don't check, 0: ignore. 1: warning. 2: strict. 
`ignore` will not check unused field; `warning` when warn if some field are not used; `strict` means @@ -149,7 +148,7 @@ def train(self): self._mode(self.model, is_test=False) self.start_time = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')) - print("training epochs started " + self.start_time) + print("training epochs started " + self.start_time, flush=True) if self.save_path is None: class psudoSW: def __getattr__(self, item): @@ -172,12 +171,12 @@ def pass_func(*args, **kwargs): del self._summary_writer def _tqdm_train(self): + self.step = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False) total_steps = data_iterator.num_batches*self.n_epochs epoch = 1 - with tqdm(total=total_steps, postfix='loss:{0:<6.5f}', desc="Epoch {}/{}" - .format(epoch, self.n_epochs), leave=False, dynamic_ncols=True) as pbar: + with tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: ava_loss = 0 for epoch in range(1, self.n_epochs+1): pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) @@ -195,28 +194,26 @@ def _tqdm_train(self): # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step) # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step) if (self.step+1) % self.print_every == 0: - pbar.update(self.print_every) - pbar.set_postfix_str("loss:{0:<6.5f}".format(ava_loss/self.print_every)) + pbar.set_postfix_str("loss:{0:<6.5f}".format(ava_loss / self.print_every)) ava_loss = 0 - + pbar.update(1) self.step += 1 if self.validate_every > 0 and self.step % self.validate_every == 0 \ and self.dev_data is not None: eval_res = self._do_validation() eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ self.tester._format_eval_results(eval_res) - pbar = _relocate_pbar(pbar, print_str=eval_str) + pbar.write(eval_str) if self.validate_every < 0 and self.dev_data: eval_res = self._do_validation() eval_str = "Epoch {}/{}. Step:{}/{}. 
".format(epoch, self.n_epochs, self.step, total_steps) + \ self.tester._format_eval_results(eval_res) - pbar = _relocate_pbar(pbar, print_str=eval_str) + pbar.write(eval_str) if epoch!=self.n_epochs: data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False) pbar.close() - def _print_train(self): """ @@ -264,9 +261,6 @@ def _print_train(self): self._do_validation() epoch += 1 - - - def _do_validation(self): res = self.tester.test() for name, num in res.items(): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 0019b022..0e2bba07 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -258,29 +258,48 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re if _unused_param: unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward + module_name = '' if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") - _miss_in_dataset = [] - _miss_out_dataset = [] + import re + mapped_missing = [] + unmapped_missing = [] + input_func_map = {} for _miss in check_res.missing: + fun_arg, module_name = re.findall("(?<=`)[a-zA-Z0-9]*?(?=`)", _miss) if '(' in _miss: # if they are like 'SomeParam(assign to xxx)' _miss = _miss.split('(')[0] - if _miss in dataset: - _miss_in_dataset.append(_miss) + input_func_map[_miss] = fun_arg + if fun_arg == _miss: + unmapped_missing.append(_miss) else: - _miss_out_dataset.append(_miss) + mapped_missing.append(_miss) - if _miss_in_dataset: - suggestions.append(f"You might need to set {_miss_in_dataset} as target(Right now " - f"target is {list(target_dict.keys())}).") - if _miss_out_dataset: - _tmp = (f"You might need to provide {_miss_out_dataset} in DataSet and set it as target(Right now " - f"target has {list(target_dict.keys())}) or output it " - f"in {prev_func_signature}(Right now output has {list(pred_dict.keys())}).") - # if _unused_field: - # _tmp += f"You can use DataSet.rename_field() to rename the field in `unused field:`. " - suggestions.append(_tmp) + for _miss in mapped_missing: + if _miss in dataset: + suggestions.append(f"Set {_miss} as target.") + else: + _tmp = '' + if check_res.unused: + _tmp = f"Check key assignment for `{input_func_map[_miss]}` when initialize {module_name}." + if _tmp: + _tmp += f' Or provide {_miss} in DataSet or output of {prev_func_signature}.' + else: + _tmp = f'Provide {_miss} in DataSet or output of {prev_func_signature}.' + suggestions.append(_tmp) + for _miss in unmapped_missing: + if _miss in dataset: + suggestions.append(f"Set {_miss} as target.") + else: + _tmp = '' + if check_res.unused: + _tmp = f"Specify your assignment for `{input_func_map[_miss]}` when initialize {module_name}." + if _tmp: + _tmp += f' Or provide {_miss} in DataSet or output of {prev_func_signature}.' + else: + _tmp = f'Provide {_miss} in DataSet or output of {prev_func_signature}.' + suggestions.append(_tmp) if check_res.duplicated: errs.append(f"\tduplicated param: {check_res.duplicated}.") @@ -297,17 +316,23 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re sugg_str = "" if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): - sugg_str += f'({idx+1}). {sugg}' + if idx>0: + sugg_str += '\t\t\t' + sugg_str += f'({idx+1}). 
{sugg}\n' + sugg_str = sugg_str[:-1] else: sugg_str += suggestions[0] + errs.append(f'\ttarget field: {list(target_dict.keys())}') + errs.append(f'\tparam from {prev_func_signature}: {list(pred_dict.keys())}') err_str = '\n' + '\n'.join(errs) + '\n\tSuggestion: ' + sugg_str raise NameError(err_str) if check_res.unused: if check_level == WARNING_CHECK_LEVEL: - _unused_warn = f'{check_res.unused} is not used by {func_signature}.' + if not module_name: + module_name = func_signature.split('.')[0] + _unused_warn = f'{check_res.unused} is not used by {module_name}.' warnings.warn(message=_unused_warn) - def _check_forward_error(forward_func, batch_x, dataset, check_level): check_res = _check_arg_dict_list(forward_func, batch_x) func_signature = get_func_signature(forward_func) @@ -402,40 +427,3 @@ def seq_mask(seq_len, max_len): seq_len = seq_len.view(-1, 1).long() # [batch_size, 1] seq_range = torch.arange(start=0, end=max_len, dtype=torch.long, device=seq_len.device).view(1, -1) # [1, max_len] return torch.gt(seq_len, seq_range) # [batch_size, max_len] - - -def _relocate_pbar(pbar:tqdm, print_str:str): - """ - - When using tqdm, you cannot print. If you print, the tqdm will duplicate. By using this function, print_str will - show above tqdm. - :param pbar: tqdm - :param print_str: - :return: - """ - - params = ['desc', 'total', 'leave', 'file', 'ncols', 'mininterval', 'maxinterval', 'miniters', 'ascii', 'disable', - 'unit', 'unit_scale', 'dynamic_ncols', 'smoothing', 'bar_format', 'initial', 'position', 'postfix', 'unit_divisor', - 'gui'] - - attr_map = {'file': 'fp', 'initial':'n', 'position':'pos'} - - param_dict = {} - for param in params: - attr_name = param - if param in attr_map: - attr_name = attr_map[param] - value = getattr(pbar, attr_name) - if attr_name == 'pos': - value = abs(value) - param_dict[param] = value - - pbar.close() - avg_time = pbar.avg_time - start_t = pbar.start_t - print(print_str) - pbar = tqdm(**param_dict) - pbar.start_t = start_t - pbar.avg_time = avg_time - pbar.sp(pbar.__repr__()) - return pbar \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 60ab7849..45c84bc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ numpy>=1.14.2 torch>=0.4.0 tensorboardX -tqdm \ No newline at end of file +tqdm>=4.28.1 \ No newline at end of file diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 493a740c..fe58b2f2 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -142,9 +142,16 @@ def test_apply2(self): def split_sent(ins): return ins['raw_sentence'].split() dataset = DataSet.read_csv('../../sentence.csv', headers=('raw_sentence', 'label'), sep='\t') - dataset.apply(split_sent, new_field_name='words') + dataset.drop(lambda x:len(x['raw_sentence'].split())==0) + dataset.apply(split_sent, new_field_name='words', is_input=True) # print(dataset) + def test_add_field(self): + ds = DataSet({"x": [3, 4]}) + ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']], is_input=True, is_target=True) + # ds.apply(lambda x:[x['x']]*3, is_input=True, is_target=True, new_field_name='y') + print(ds) + def test_save_load(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) ds.save("./my_ds.pkl") diff --git a/test/core/test_tester.py b/test/core/test_tester.py index 68143f7b..99a8000e 100644 --- a/test/core/test_tester.py +++ b/test/core/test_tester.py @@ -4,6 +4,64 @@ pickle_path = "data_for_tests" +import numpy as np +import torch.nn.functional as F +from torch import nn 
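+# NOTE: `time` is imported for the time.sleep(0.1) call inside the fake model's forward() below.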
+import time +from fastNLP.core.utils import CheckError +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance +from fastNLP.core.losses import BCELoss +from fastNLP.core.losses import CrossEntropyLoss +from fastNLP.core.metrics import AccuracyMetric +from fastNLP.core.optimizer import SGD +from fastNLP.core.tester import Tester +from fastNLP.models.base_model import NaiveClassifier + +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set + + +def prepare_fake_dataset2(*args, size=100): + ys = np.random.randint(4, size=100, dtype=np.int64) + data = {'y': ys} + for arg in args: + data[arg] = np.random.randn(size, 5) + return DataSet(data=data) + class TestTester(unittest.TestCase): def test_case_1(self): - pass + # 检查报错提示能否正确提醒用户 + # 这里传入多余参数,让其duplicate + dataset = prepare_fake_dataset2('x1', 'x_unused') + dataset.rename_field('x_unused', 'x2') + dataset.set_input('x1', 'x2') + dataset.set_target('y', 'x1') + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(5, 4) + def forward(self, x1, x2): + x1 = self.fc(x1) + x2 = self.fc(x2) + x = x1 + x2 + time.sleep(0.1) + # loss = F.cross_entropy(x, y) + return {'preds': x} + + model = Model() + tester = Tester( + data=dataset, + model=model, + metrics=AccuracyMetric()) + tester.test() diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 38fb6e0e..a69438ae 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -3,7 +3,7 @@ import numpy as np import torch.nn.functional as F from torch import nn - +import time from fastNLP.core.utils import CheckError from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance @@ -212,8 +212,8 @@ def test_trainer_suggestion6(self): # 这里传入多余参数,让其duplicate dataset = prepare_fake_dataset2('x1', 'x_unused') dataset.rename_field('x_unused', 'x2') - dataset.set_input('x1', 'x2', 'y') - dataset.set_target('x1', 'x2') + dataset.set_input('x1', 'x2') + dataset.set_target('y', 'x1') class Model(nn.Module): def __init__(self): super().__init__() @@ -222,8 +222,9 @@ def forward(self, x1, x2): x1 = self.fc(x1) x2 = self.fc(x2) x = x1 + x2 + time.sleep(0.1) # loss = F.cross_entropy(x, y) - return {'pred': x} + return {'preds': x} model = Model() trainer = Trainer( diff --git a/tutorials/fastnlp_tutorial_1204.ipynb b/tutorials/fastnlp_tutorial_1204.ipynb index 1fa1adca..8d896bf2 100644 --- a/tutorials/fastnlp_tutorial_1204.ipynb +++ b/tutorials/fastnlp_tutorial_1204.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -34,17 +34,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8529\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from fastNLP import DataSet\n", "from fastNLP import Instance\n", @@ -56,20 +48,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n", - "'label': 1}\n", - "{'raw_sentence': -LRB- Tries -RRB- to parody a genre that 's already a joke in the United States .,\n", - "'label': 1}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 使用数字索引[k],获取第k个样本\n", "print(dataset[0])\n", @@ -90,21 +71,9 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': fake data,\n", - "'label': 0}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# DataSet.append(Instance)加入新数据\n", "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", @@ -121,18 +90,9 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n", - "'label': 1}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 将所有数字转为小写\n", "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", @@ -141,18 +101,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n", - "'label': 1}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# label转int\n", "dataset.apply(lambda x: int(x['label']), new_field_name='label')\n", @@ -161,28 +112,9 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "ename": "RuntimeError", - "evalue": "Cannot create FieldArray with an empty list.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msplit_sent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mins\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'raw_sentence'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msplit_sent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_field_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'words'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/dataset.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, new_field_name, **kwargs)\u001b[0m\n\u001b[1;32m 265\u001b[0m **extra_param)\n\u001b[1;32m 266\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 267\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_field\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnew_field_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mextra_param\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 268\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/dataset.py\u001b[0m in \u001b[0;36madd_field\u001b[0;34m(self, name, fields, padding_val, is_input, is_target)\u001b[0m\n\u001b[1;32m 158\u001b[0m f\"Dataset size {len(self)} != field size {len(fields)}\")\n\u001b[1;32m 159\u001b[0m self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target,\n\u001b[0;32m--> 160\u001b[0;31m is_input=is_input)\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdelete_field\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/fieldarray.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, content, padding_val, is_target, is_input)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_input\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_input\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mis_input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_target\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_target\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mis_target\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/fieldarray.py\u001b[0m in \u001b[0;36mis_input\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mis_input\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetter\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mis_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 48\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpytype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_type_detection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 49\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_map_to_np_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpytype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_input\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/fieldarray.py\u001b[0m in \u001b[0;36m_type_detection\u001b[0;34m(self, content)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# strict check 2-D list\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Please provide 2-D list.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mtype_set\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_type_detection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_set\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mint\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtype_set\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtype_set\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mtype_set\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/fieldarray.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m 
\u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# strict check 2-D list\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Please provide 2-D list.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mtype_set\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_type_detection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_set\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mint\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtype_set\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtype_set\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mtype_set\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/fieldarray.py\u001b[0m in \u001b[0;36m_type_detection\u001b[0;34m(self, content)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;31m# content is a 1-D list\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot create FieldArray with an empty list.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0mtype_set\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mRuntimeError\u001b[0m: Cannot create FieldArray with an empty list." 
- ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 使用空格分割句子\n", "def split_sent(ins):\n", @@ -193,20 +125,9 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n", - "'label': 1,\n", - "'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'],\n", - "'seq_len': 37}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 增加长度信息\n", "dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')\n", @@ -223,17 +144,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "38\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dataset.drop(lambda x: x['seq_len'] <= 3)\n", "print(len(dataset))" @@ -250,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -264,18 +177,9 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "27\n", - "11" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 分出测试集、训练集\n", "\n", @@ -296,20 +200,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': that the chuck norris `` grenade gag '' occurs about 7 times during windtalkers is a good indication of how serious-minded the film is .,\n", - "'label': 2,\n", - "'words': [6, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 8, 24, 1, 5, 1, 1, 2, 15, 10, 3],\n", - "'seq_len': 25}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from fastNLP import Vocabulary\n", "\n", @@ -336,36 +229,9 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNNText(\n", - " (embed): Embedding(\n", - " (embed): Embedding(32, 50, padding_idx=0)\n", - " (dropout): Dropout(p=0.0)\n", - " )\n", - " (conv_pool): ConvMaxpool(\n", - " (convs): ModuleList(\n", - " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", - " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", - " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", - " )\n", - " )\n", - " (dropout): Dropout(p=0.1)\n", - " (fc): Linear(\n", - " (linear): Linear(in_features=12, out_features=5, bias=True)\n", - " )\n", - ")" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from fastNLP.models import CNNText\n", "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", @@ -432,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": 
[], "source": [ @@ -469,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -492,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -501,94 +367,9 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training epochs started 2018-12-04 22:51:24\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/5. Step:1/5. AccuracyMetric: acc=0.296296\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 2/5. Step:2/5. AccuracyMetric: acc=0.407407\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 3/5. Step:3/5. AccuracyMetric: acc=0.518519\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 4/5. Step:4/5. AccuracyMetric: acc=0.481481\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 5/5. Step:5/5. AccuracyMetric: acc=0.592593\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 实例化Trainer,传入模型和数据,进行训练\n", "# 先在test_data拟合\n", @@ -604,101 +385,9 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training epochs started 2018-12-04 22:52:01\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/5. Step:1/5. AccuracyMetric: acc=0.296296\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 2/5. Step:2/5. AccuracyMetric: acc=0.222222\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 3/5. Step:3/5. AccuracyMetric: acc=0.259259\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 4/5. Step:4/5. AccuracyMetric: acc=0.296296\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 5/5. Step:5/5. 
AccuracyMetric: acc=0.259259\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train finished!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 用train_data训练,在test_data验证\n", "trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,\n", @@ -713,19 +402,9 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[tester] \n", - "AccuracyMetric: acc=0.259259\n", - "{'AccuracyMetric': {'acc': 0.259259}}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 调用Tester在test_data上评价效果\n", "from fastNLP import Tester\n", From aea931812b75aa56106996906f647a1ac341aa30 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 5 Dec 2018 20:23:40 +0800 Subject: [PATCH 59/67] =?UTF-8?q?1.=20trainer=E4=B8=ADlosser=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=B8=BAloss?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 6 +++--- fastNLP/core/utils.py | 1 - test/core/test_tester.py | 12 ++++++------ test/core/test_trainer.py | 19 ++++++++++--------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 8f676279..45055be5 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -28,7 +28,7 @@ class Trainer(object): """Main Training Loop """ - def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, + def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, validate_every=-1, dev_data=None, use_cuda=False, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, metric_key=None, sampler=RandomSampler(), use_tqdm=True): @@ -36,7 +36,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat :param DataSet train_data: the training data :param torch.nn.modules.module model: a PyTorch model - :param LossBase losser: a loss object + :param LossBase loss: a loss object :param MetricBase or List[MetricBase] metrics: a metric object or a list of metrics :param int n_epochs: the number of training epochs :param int batch_size: batch size for training and validation @@ -88,7 +88,7 @@ def __init__(self, train_data, model, losser=None, metrics=None, n_epochs=3, bat self.metric_key = None # prepare loss - losser = _prepare_losser(losser) + losser = _prepare_losser(loss) # sampler check if not isinstance(sampler, BaseSampler): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 0e2bba07..508d5587 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -7,7 +7,6 @@ import numpy as np import torch -from tqdm import tqdm CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs'], verbose=False) diff --git a/test/core/test_tester.py b/test/core/test_tester.py index 99a8000e..d606c0b8 100644 --- a/test/core/test_tester.py +++ b/test/core/test_tester.py @@ -42,7 +42,6 @@ def prepare_fake_dataset2(*args, size=100): class TestTester(unittest.TestCase): def test_case_1(self): # 检查报错提示能否正确提醒用户 - # 这里传入多余参数,让其duplicate dataset = prepare_fake_dataset2('x1', 'x_unused') dataset.rename_field('x_unused', 'x2') dataset.set_input('x1', 'x2') @@ -60,8 +59,9 @@ def forward(self, x1, x2): return {'preds': x} 
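        # The model outputs 'preds' while AccuracyMetric expects 'pred', and no field is mapped
        # to 'target', so the Tester call below is expected to raise an informative NameError.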
model = Model() - tester = Tester( - data=dataset, - model=model, - metrics=AccuracyMetric()) - tester.test() + with self.assertRaises(NameError): + tester = Tester( + data=dataset, + model=model, + metrics=AccuracyMetric()) + tester.test() diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index a69438ae..6f6fbbf3 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -48,7 +48,7 @@ def test_case(self): model = NaiveClassifier(2, 1) trainer = Trainer(train_set, model, - losser=BCELoss(pred="predict", target="y"), + loss=BCELoss(pred="predict", target="y"), metrics=AccuracyMetric(pred="predict", target="y"), n_epochs=10, batch_size=32, @@ -227,14 +227,15 @@ def forward(self, x1, x2): return {'preds': x} model = Model() - trainer = Trainer( - train_data=dataset, - model=model, - dev_data=dataset, - losser=CrossEntropyLoss(), - metrics=AccuracyMetric(), - use_tqdm=False, - print_every=2) + with self.assertRaises(NameError): + trainer = Trainer( + train_data=dataset, + model=model, + dev_data=dataset, + loss=CrossEntropyLoss(), + metrics=AccuracyMetric(), + use_tqdm=False, + print_every=2) def test_case2(self): # check metrics Wrong From 6129a31c1de1c4aeef8041b9bd69038d8896d622 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Thu, 6 Dec 2018 10:07:45 +0800 Subject: [PATCH 60/67] * fix tests * clean up unused codes --- fastNLP/core/fieldarray.py | 4 +- fastNLP/core/metrics.py | 2 +- fastNLP/core/predictor.py | 17 - fastNLP/core/trainer.py | 10 +- test/core/__init__.py | 0 test/core/test_dataset.py | 8 +- test/core/test_fieldarray.py | 10 +- test/core/test_loss.py | 29 +- test/core/test_metrics.py | 4 +- test/core/test_optimizer.py | 8 + test/test_tutorial.py | 4 +- tutorials/fastnlp_tutorial_1203.ipynb | 526 ++++++++++++++++++++++++++ 12 files changed, 561 insertions(+), 61 deletions(-) delete mode 100644 test/core/__init__.py create mode 100644 tutorials/fastnlp_tutorial_1203.ipynb diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 5167be35..5fa8276e 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -162,7 +162,7 @@ def get(self, indices): if self.is_input is False and self.is_target is False: raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name)) batch_size = len(indices) - # TODO 当这个fieldArray是seq_length这种只有一位的内容时,不需要padding,需要再讨论一下 + if not is_iterable(self.content[0]): array = np.array([self.content[i] for i in indices], dtype=self.dtype) elif self.dtype in (np.int64, np.float64): @@ -170,7 +170,7 @@ def get(self, indices): array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype) for i, idx in enumerate(indices): array[i][:len(self.content[idx])] = self.content[idx] - else: # should only be str + else: # should only be str array = np.array([self.content[i] for i in indices]) return array diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index d97ba699..32c2306f 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -467,7 +467,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary'): precision = precision_score(y_true, y_pred, labels=labels, pos_label=pos_label, average=average) recall = recall_score(y_true, y_pred, labels=labels, pos_label=pos_label, average=average) if isinstance(precision, np.ndarray): - res = 2 * precision * recall / (precision + recall) + res = 2 * precision * recall / (precision + recall + 1e-10) res[(precision + recall) <= 0] = 0 return res return 2 * precision * recall / 
(precision + recall) if (precision + recall) > 0 else 0 diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 9ce1d792..de9ddc8c 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,4 +1,3 @@ -import numpy as np import torch from fastNLP.core.batch import Batch @@ -48,19 +47,3 @@ def data_forward(self, network, x): """Forward through network.""" y = network(**x) return y - - -def seq_label_post_processor(batch_outputs, label_vocab): - results = [] - for batch in batch_outputs: - for example in np.array(batch): - results.append([label_vocab.to_word(int(x)) for x in example]) - return results - - -def text_classify_post_processor(batch_outputs, label_vocab): - results = [] - for batch_out in batch_outputs: - idx = np.argmax(batch_out.detach().numpy(), axis=-1) - results.extend([label_vocab.to_word(i) for i in idx]) - return results diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 45055be5..a3f81c00 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -2,11 +2,11 @@ import time from datetime import datetime from datetime import timedelta -from tqdm.autonotebook import tqdm import torch from tensorboardX import SummaryWriter from torch import nn +from tqdm.autonotebook import tqdm from fastNLP.core.batch import Batch from fastNLP.core.dataset import DataSet @@ -24,6 +24,7 @@ from fastNLP.core.utils import _move_dict_value_to_device from fastNLP.core.utils import get_func_signature + class Trainer(object): """Main Training Loop @@ -263,8 +264,10 @@ def _print_train(self): def _do_validation(self): res = self.tester.test() - for name, num in res.items(): - self._summary_writer.add_scalar("valid_{}".format(name), num, global_step=self.step) + for name, metric in res.items(): + for metric_key, metric_val in metric.items(): + self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val, + global_step=self.step) if self.save_path is not None and self._better_eval_result(res): metric_key = self.metric_key if self.metric_key is not None else "None" self._save_model(self.model, @@ -386,6 +389,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ f"should be torch.size([])") loss.backward() except CheckError as e: + # TODO: another error raised if CheckError caught pre_func_signature = get_func_signature(model.forward) _check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature, check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y, diff --git a/test/core/__init__.py b/test/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index fe58b2f2..9527e8ee 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -141,8 +141,10 @@ def test_get_target_name(self): def test_apply2(self): def split_sent(ins): return ins['raw_sentence'].split() - dataset = DataSet.read_csv('../../sentence.csv', headers=('raw_sentence', 'label'), sep='\t') - dataset.drop(lambda x:len(x['raw_sentence'].split())==0) + + dataset = DataSet.read_csv('test/data_for_tests/tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), + sep='\t') + dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0) dataset.apply(split_sent, new_field_name='words', is_input=True) # print(dataset) @@ -160,9 +162,9 @@ def test_save_load(self): ds_1 = DataSet.load("./my_ds.pkl") os.remove("my_ds.pkl") + class TestDataSetIter(unittest.TestCase): def test__repr__(self): ds = 
DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) for iter in ds: self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}") - diff --git a/test/core/test_fieldarray.py b/test/core/test_fieldarray.py index c0b8a592..1204cda5 100644 --- a/test/core/test_fieldarray.py +++ b/test/core/test_fieldarray.py @@ -31,18 +31,18 @@ def test_type_conversion(self): self.assertEqual(fa.pytype, float) self.assertEqual(fa.dtype, np.float64) - fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=False) + fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True) fa.append(10) self.assertEqual(fa.pytype, float) self.assertEqual(fa.dtype, np.float64) - fa = FieldArray("y", ["a", "b", "c", "d"], is_input=False) + fa = FieldArray("y", ["a", "b", "c", "d"], is_input=True) fa.append("e") self.assertEqual(fa.dtype, np.str) self.assertEqual(fa.pytype, str) def test_support_np_array(self): - fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=False) + fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=True) self.assertEqual(fa.dtype, np.ndarray) self.assertEqual(fa.pytype, np.ndarray) @@ -50,12 +50,12 @@ def test_support_np_array(self): self.assertEqual(fa.dtype, np.ndarray) self.assertEqual(fa.pytype, np.ndarray) - fa = FieldArray("my_field", np.random.rand(3, 5), is_input=False) + fa = FieldArray("my_field", np.random.rand(3, 5), is_input=True) # in this case, pytype is actually a float. We do not care about it. self.assertEqual(fa.dtype, np.float64) def test_nested_list(self): - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.1, 2.2, 3.3, 4.4, 5.5]], is_input=False) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.1, 2.2, 3.3, 4.4, 5.5]], is_input=True) self.assertEqual(fa.pytype, float) self.assertEqual(fa.dtype, np.float64) diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 22f11234..a7c303e2 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -6,7 +6,6 @@ import torch.nn.functional as F import fastNLP.core.losses as loss -from fastNLP.core.losses import LossFunc class TestLoss(unittest.TestCase): @@ -245,31 +244,7 @@ def test_case_7(self): self.assertEqual(int(los * 1000), int(r * 1000)) def test_case_8(self): - def func(a, b): - return F.cross_entropy(a, b) - - def func2(a, truth): - return func(a, truth) - - def func3(predict, truth): - return func(predict, truth) - - def func4(a, b, c=2): - return (a + b) * c - - def func6(a, b, **kwargs): - c = kwargs['c'] - return (a + b) * c - - get_loss = LossFunc(func, {'a': 'predict', 'b': 'truth'}) - predict = torch.randn(5, 3) - truth = torch.LongTensor([1, 0, 1, 2, 1]) - loss1 = get_loss({'predict': predict}, {'truth': truth}) - get_loss_2 = LossFunc(func2, {'a': 'predict'}) - loss2 = get_loss_2({'predict': predict}, {'truth': truth}) - get_loss_3 = LossFunc(func3) - loss3 = get_loss_3({'predict': predict}, {'truth': truth}) - assert loss1 == loss2 and loss1 == loss3 + pass class TestLoss_v2(unittest.TestCase): @@ -317,7 +292,7 @@ def test_losser2(self): target_dict = {'target': torch.zeros(16, 3).long()} los = loss.CrossEntropyLoss() - print(los(pred_dict=pred_dict, target_dict=target_dict)) + # print(los(pred_dict=pred_dict, target_dict=target_dict)) def test_losser3(self): # (2) with corrupted size diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 9286a26f..d2e45379 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -4,7 +4,7 @@ import torch from fastNLP.core.metrics import AccuracyMetric -from fastNLP.core.metrics 
import accuracy_score, recall_score, precision_score, f1_score +from fastNLP.core.metrics import accuracy_score, recall_score, precision_score, f1_score, pred_topk, accuracy_topk class TestAccuracyMetric(unittest.TestCase): @@ -143,5 +143,7 @@ def test_case_1(self): _ = precision_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) _ = recall_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) _ = f1_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) + _ = accuracy_topk(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), k=3) + _ = pred_topk(np.random.randint(0, 3, size=(10, 1))) # 跑通即可 diff --git a/test/core/test_optimizer.py b/test/core/test_optimizer.py index 7b29b826..8ffa1a72 100644 --- a/test/core/test_optimizer.py +++ b/test/core/test_optimizer.py @@ -10,9 +10,13 @@ def test_SGD(self): optim = SGD(torch.nn.Linear(10, 3).parameters()) self.assertTrue("lr" in optim.__dict__["settings"]) self.assertTrue("momentum" in optim.__dict__["settings"]) + res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) + self.assertTrue(isinstance(res, torch.optim.SGD)) optim = SGD(lr=0.001) self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) + res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) + self.assertTrue(isinstance(res, torch.optim.SGD)) optim = SGD(lr=0.002, momentum=0.989) self.assertEqual(optim.__dict__["settings"]["lr"], 0.002) @@ -27,9 +31,13 @@ def test_Adam(self): optim = Adam(torch.nn.Linear(10, 3).parameters()) self.assertTrue("lr" in optim.__dict__["settings"]) self.assertTrue("weight_decay" in optim.__dict__["settings"]) + res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) + self.assertTrue(isinstance(res, torch.optim.Adam)) optim = Adam(lr=0.001) self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) + res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) + self.assertTrue(isinstance(res, torch.optim.Adam)) optim = Adam(lr=0.002, weight_decay=0.989) self.assertEqual(optim.__dict__["settings"]["lr"], 0.002) diff --git a/test/test_tutorial.py b/test/test_tutorial.py index f3648b4f..68cb6a41 100644 --- a/test/test_tutorial.py +++ b/test/test_tutorial.py @@ -72,13 +72,13 @@ def split_sent(ins): # 实例化Trainer,传入模型和数据,进行训练 copy_model = deepcopy(model) overfit_trainer = Trainer(train_data=test_data, model=copy_model, - losser=CrossEntropyLoss(pred="output", target="label_seq"), + loss=CrossEntropyLoss(pred="output", target="label_seq"), metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4, dev_data=test_data, save_path="./save") overfit_trainer.train() trainer = Trainer(train_data=train_data, model=model, - losser=CrossEntropyLoss(pred="output", target="label_seq"), + loss=CrossEntropyLoss(pred="output", target="label_seq"), metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4, dev_data=test_data, save_path="./save") trainer.train() diff --git a/tutorials/fastnlp_tutorial_1203.ipynb b/tutorials/fastnlp_tutorial_1203.ipynb new file mode 100644 index 00000000..cb8fa6a0 --- /dev/null +++ b/tutorials/fastnlp_tutorial_1203.ipynb @@ -0,0 +1,526 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "fastNLP上手教程\n", + "-------\n", + "\n", + "fastNLP提供方便的数据预处理,训练和测试模型的功能" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": 
[ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + " \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP/')\n", + "\n", + "import fastNLP as fnlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataSet & Instance\n", + "------\n", + "\n", + "fastNLP用DataSet和Instance保存和处理数据。每个DataSet表示一个数据集,每个Instance表示一个数据样本。一个DataSet存有多个Instance,每个Instance可以自定义存哪些内容。\n", + "\n", + "有一些read_*方法,可以轻松从文件读取数据,存成DataSet。" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n", + "'label': 1}\n" + ] + } + ], + "source": [ + "from fastNLP import DataSet\n", + "from fastNLP import Instance\n", + "\n", + "# 从csv读取数据到DataSet\n", + "dataset = DataSet.read_csv('sentence.csv', headers=('raw_sentence', 'label'), sep='\\t')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'raw_sentence': fake data,\n", + "'label': 0}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataSet.append(Instance)加入新数据\n", + "\n", + "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", + "dataset[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# DataSet.apply(func, new_field_name)对数据预处理\n", + "\n", + "# 将所有数字转为小写\n", + "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", + "# label转int\n", + "dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)\n", + "# 使用空格分割句子\n", + "dataset.drop(lambda x:len(x['raw_sentence'].split())==0)\n", + "def split_sent(ins):\n", + " return ins['raw_sentence'].split()\n", + "dataset.apply(split_sent, new_field_name='words', is_input=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# DataSet.drop(func)筛除数据\n", + "# 删除低于某个长度的词语\n", + "# dataset.drop(lambda x: len(x['words']) <= 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train size: 5971\n", + "Test size: 2558\n" + ] + } + ], + "source": [ + "# 分出测试集、训练集\n", + "\n", + "test_data, train_data = dataset.split(0.3)\n", + "print(\"Train size: \", len(test_data))\n", + "print(\"Test size: \", len(train_data))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary\n", + "------\n", + "\n", + "fastNLP中的Vocabulary轻松构建词表,将词转成数字" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': gussied up with so many distracting special effects and visual party tricks that it 's not clear whether we 're supposed to shriek or laugh .,\n", + 
"'label': 1,\n", + "'label_seq': 1,\n", + "'words': ['gussied', 'up', 'with', 'so', 'many', 'distracting', 'special', 'effects', 'and', 'visual', 'party', 'tricks', 'that', 'it', \"'s\", 'not', 'clear', 'whether', 'we', \"'re\", 'supposed', 'to', 'shriek', 'or', 'laugh', '.'],\n", + "'word_seq': [1, 65, 16, 43, 108, 1, 329, 433, 7, 319, 1313, 1, 12, 10, 11, 27, 1428, 567, 86, 134, 1949, 8, 1, 49, 506, 2]}\n" + ] + } + ], + "source": [ + "from fastNLP import Vocabulary\n", + "\n", + "# 构建词表, Vocabulary.add(word)\n", + "vocab = Vocabulary(min_freq=2)\n", + "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", + "vocab.build_vocab()\n", + "\n", + "# index句子, Vocabulary.to_index(word)\n", + "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", + "test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", + "\n", + "\n", + "print(test_data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch_x has: {'words': array([list(['this', 'kind', 'of', 'hands-on', 'storytelling', 'is', 'ultimately', 'what', 'makes', 'shanghai', 'ghetto', 'move', 'beyond', 'a', 'good', ',', 'dry', ',', 'reliable', 'textbook', 'and', 'what', 'allows', 'it', 'to', 'rank', 'with', 'its', 'worthy', 'predecessors', '.']),\n", + " list(['the', 'entire', 'movie', 'is', 'filled', 'with', 'deja', 'vu', 'moments', '.'])],\n", + " dtype=object), 'word_seq': tensor([[ 19, 184, 6, 1, 481, 9, 206, 50, 91, 1210, 1609, 1330,\n", + " 495, 5, 63, 4, 1269, 4, 1, 1184, 7, 50, 1050, 10,\n", + " 8, 1611, 16, 21, 1039, 1, 2],\n", + " [ 3, 711, 22, 9, 1282, 16, 2482, 2483, 200, 2, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0]])}\n", + "batch_y has: {'label_seq': tensor([3, 2])}\n" + ] + } + ], + "source": [ + "# 假设你们需要做强化学习或者gan之类的项目,也许你们可以使用这里的dataset\n", + "from fastNLP.core.batch import Batch\n", + "from fastNLP.core.sampler import RandomSampler\n", + "\n", + "batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())\n", + "for batch_x, batch_y in batch_iterator:\n", + " print(\"batch_x has: \", batch_x)\n", + " print(\"batch_y has: \", batch_y)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CNNText(\n", + " (embed): Embedding(\n", + " (embed): Embedding(3470, 50, padding_idx=0)\n", + " (dropout): Dropout(p=0.0)\n", + " )\n", + " (conv_pool): ConvMaxpool(\n", + " (convs): ModuleList(\n", + " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", + " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", + " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", + " )\n", + " )\n", + " (dropout): Dropout(p=0.1)\n", + " (fc): Linear(\n", + " (linear): Linear(in_features=12, out_features=5, bias=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 定义一个简单的Pytorch模型\n", + "\n", + "from fastNLP.models import CNNText\n", + "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trainer & 
Tester\n", + "------\n", + "\n", + "使用fastNLP的Trainer训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import Trainer\n", + "from copy import deepcopy\n", + "from fastNLP.core.losses import CrossEntropyLoss\n", + "from fastNLP.core.metrics import AccuracyMetric" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-05 15:37:15\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1870), HTML(value='')), layout=Layout(display…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10. Step:187/1870. AccuracyMetric: acc=0.351365\n", + "Epoch 2/10. Step:374/1870. AccuracyMetric: acc=0.470943\n", + "Epoch 3/10. Step:561/1870. AccuracyMetric: acc=0.600402\n", + "Epoch 4/10. Step:748/1870. AccuracyMetric: acc=0.702227\n", + "Epoch 5/10. Step:935/1870. AccuracyMetric: acc=0.79099\n", + "Epoch 6/10. Step:1122/1870. AccuracyMetric: acc=0.846424\n", + "Epoch 7/10. Step:1309/1870. AccuracyMetric: acc=0.874058\n", + "Epoch 8/10. Step:1496/1870. AccuracyMetric: acc=0.898844\n", + "Epoch 9/10. Step:1683/1870. AccuracyMetric: acc=0.910568\n", + "Epoch 10/10. Step:1870/1870. AccuracyMetric: acc=0.921286\n", + "\r" + ] + } + ], + "source": [ + "# 进行overfitting测试\n", + "copy_model = deepcopy(model)\n", + "overfit_trainer = Trainer(model=copy_model, \n", + " train_data=test_data, \n", + " dev_data=test_data,\n", + " losser=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", + " metrics=AccuracyMetric(),\n", + " n_epochs=10,\n", + " save_path=None)\n", + "overfit_trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-05 15:37:41\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=400), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'squeeze'", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mn_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m save_path='save/')\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Train finished!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_summary_writer\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mSummaryWriter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_tqdm\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tqdm_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_print_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/trainer.py\u001b[0m in \u001b[0;36m_tqdm_train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0mpbar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0meval_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate_every\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0meval_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_validation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0meval_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Epoch {}/{}. Step:{}/{}. 
\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_epochs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal_steps\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtester\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_format_eval_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0meval_res\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/trainer.py\u001b[0m in \u001b[0;36m_do_validation\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtester\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 267\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_summary_writer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"valid_{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 268\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_path\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_better_eval_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0mmetric_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetric_key\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetric_key\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"None\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda2/envs/python3/lib/python3.6/site-packages/tensorboardX/writer.py\u001b[0m in \u001b[0;36madd_scalar\u001b[0;34m(self, tag, scalar_value, global_step, walltime)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_caffe2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0mscalar_value\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mworkspace\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFetchBlob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 334\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_writer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_summary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscalar_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwalltime\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0madd_scalars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmain_tag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtag_scalar_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwalltime\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda2/envs/python3/lib/python3.6/site-packages/tensorboardX/summary.py\u001b[0m in \u001b[0;36mscalar\u001b[0;34m(name, scalar, collections)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_clean_tag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0mscalar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_np\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0;32massert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'scalar should be 0D'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0mscalar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSummary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mValue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msimple_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'squeeze'" + ], + "output_type": "error" + } + ], + "source": [ + "# 实例化Trainer,传入模型和数据,进行训练\n", + "trainer = Trainer(model=model, \n", + " train_data=train_data, \n", + " dev_data=test_data,\n", + " losser=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", + " metrics=AccuracyMetric(),\n", + " n_epochs=5,\n", + " save_path='save/')\n", + "trainer.train()\n", + 
"print('Train finished!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import Tester\n", + "\n", + "tester = Tester(data=test_data, model=model, metrics=AccuracyMetric())\n", + "acc = tester.test()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In summary\n", + "\n", + "## fastNLP Trainer的伪代码逻辑\n", + "### 1. 准备DataSet,假设DataSet中共有如下的fields\n", + " ['raw_sentence', 'word_seq1', 'word_seq2', 'raw_label','label']\n", + " 通过\n", + " DataSet.set_input('word_seq1', word_seq2', flag=True)将'word_seq1', 'word_seq2'设置为input\n", + " 通过\n", + " DataSet.set_target('label', flag=True)将'label'设置为target\n", + "### 2. 初始化模型\n", + " class Model(nn.Module):\n", + " def __init__(self):\n", + " xxx\n", + " def forward(self, word_seq1, word_seq2):\n", + " # (1) 这里使用的形参名必须和DataSet中的input field的名称对应。因为我们是通过形参名, 进行赋值的\n", + " # (2) input field的数量可以多于这里的形参数量。但是不能少于。\n", + " xxxx\n", + " # 输出必须是一个dict\n", + "### 3. Trainer的训练过程\n", + " (1) 从DataSet中按照batch_size取出一个batch,调用Model.forward\n", + " (2) 将 Model.forward的结果 与 标记为target的field 传入Losser当中。\n", + " 由于每个人写的Model.forward的output的dict可能key并不一样,比如有人是{'pred':xxx}, {'output': xxx}; \n", + " 另外每个人将target可能也会设置为不同的名称, 比如有人是label, 有人设置为target;\n", + " 为了解决以上的问题,我们的loss提供映射机制\n", + " 比如CrossEntropyLosser的需要的输入是(prediction, target)。但是forward的output是{'output': xxx}; 'label'是target\n", + " 那么初始化losser的时候写为CrossEntropyLosser(prediction='output', target='label')即可\n", + " (3) 对于Metric是同理的\n", + " Metric计算也是从 forward的结果中取值 与 设置target的field中取值。 也是可以通过映射找到对应的值 \n", + " \n", + " \n", + "\n", + "## 一些问题.\n", + "### 1. DataSet中为什么需要设置input和target\n", + " 只有被设置为input或者target的数据才会在train的过程中被取出来\n", + " (1.1) 我们只会在设置为input的field中寻找传递给Model.forward的参数。\n", + " (1.2) 我们在传递值给losser或者metric的时候会使用来自: \n", + " (a)Model.forward的output\n", + " (b)被设置为target的field\n", + " \n", + "\n", + "### 2. 我们是通过forwad中的形参名将DataSet中的field赋值给对应的参数\n", + " (1.1) 构建模型过程中,\n", + " 例如:\n", + " DataSet中x,seq_lens是input,那么forward就应该是\n", + " def forward(self, x, seq_lens):\n", + " pass\n", + " 我们是通过形参名称进行匹配的field的\n", + " \n", + "\n", + "\n", + "### 1. 加载数据到DataSet\n", + "### 2. 使用apply操作对DataSet进行预处理\n", + " (2.1) 处理过程中将某些field设置为input,某些field设置为target\n", + "### 3. 
构建模型\n", + " (3.1) 构建模型过程中,需要注意forward函数的形参名需要和DataSet中设置为input的field名称是一致的。\n", + " 例如:\n", + " DataSet中x,seq_lens是input,那么forward就应该是\n", + " def forward(self, x, seq_lens):\n", + " pass\n", + " 我们是通过形参名称进行匹配的field的\n", + " (3.2) 模型的forward的output需要是dict类型的。\n", + " 建议将输出设置为{\"pred\": xx}.\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From cd83866527c8b947f072d473660623343aee3919 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 6 Dec 2018 11:16:25 +0800 Subject: [PATCH 61/67] bug fix in LossInForward --- fastNLP/core/losses.py | 3 ++- fastNLP/core/utils.py | 22 +++++++++++++--------- test/core/test_trainer.py | 6 +++--- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index a4976540..fbd64e81 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -221,7 +221,8 @@ def __init__(self, loss_key='loss'): def get_loss(self, **kwargs): if self.loss_key not in kwargs: - check_res = CheckRes(missing=[self.loss_key], + check_res = CheckRes(missing=[self.loss_key + f"(assign to `{self.loss_key}` " \ + f"in `{self.__class__.__name__}`"], unused=[], duplicated=[], required=[], diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 508d5587..c58e4f71 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -257,7 +257,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re if _unused_param: unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward - module_name = '' + module_name = func_signature.split('.')[0] if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") import re @@ -265,15 +265,19 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re unmapped_missing = [] input_func_map = {} for _miss in check_res.missing: - fun_arg, module_name = re.findall("(?<=`)[a-zA-Z0-9]*?(?=`)", _miss) if '(' in _miss: # if they are like 'SomeParam(assign to xxx)' _miss = _miss.split('(')[0] - input_func_map[_miss] = fun_arg - if fun_arg == _miss: - unmapped_missing.append(_miss) + matches = re.findall("(?<=`)[a-zA-Z0-9]*?(?=`)", _miss) + if len(matches) == 2: + fun_arg, module_name = matches + input_func_map[_miss] = fun_arg + if fun_arg == _miss: + unmapped_missing.append(_miss) + else: + mapped_missing.append(_miss) else: - mapped_missing.append(_miss) + unmapped_missing.append(_miss) for _miss in mapped_missing: if _miss in dataset: @@ -281,7 +285,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re else: _tmp = '' if check_res.unused: - _tmp = f"Check key assignment for `{input_func_map[_miss]}` when initialize {module_name}." + _tmp = f"Check key assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}." if _tmp: _tmp += f' Or provide {_miss} in DataSet or output of {prev_func_signature}.' 
else: @@ -293,11 +297,11 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re else: _tmp = '' if check_res.unused: - _tmp = f"Specify your assignment for `{input_func_map[_miss]}` when initialize {module_name}." + _tmp = f"Specify your assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}." if _tmp: _tmp += f' Or provide {_miss} in DataSet or output of {prev_func_signature}.' else: - _tmp = f'Provide {_miss} in DataSet or output of {prev_func_signature}.' + _tmp = f'Provide {_miss} in output of {prev_func_signature} or DataSet.' suggestions.append(_tmp) if check_res.duplicated: diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 6f6fbbf3..2f2505e4 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -159,8 +159,8 @@ def forward(self, x1, x2, y): def test_trainer_suggestion4(self): # 检查报错提示能否正确提醒用户 # 这里传入forward需要的数据,是否可以正确提示unused - dataset = prepare_fake_dataset2('x1', 'x_unused') - dataset.set_input('x1', 'x_unused', 'y', flag=True) + dataset = prepare_fake_dataset2('x1', 'x2') + dataset.set_input('x1', 'x2', 'y', flag=True) class Model(nn.Module): def __init__(self): super().__init__() @@ -170,7 +170,7 @@ def forward(self, x1, x2, y): x2 = self.fc(x2) x = x1 + x2 loss = F.cross_entropy(x, y) - return {'loss': loss} + return {'losses': loss} model = Model() with self.assertRaises(NameError): From 27e9453d19dd61141f9def91cfbeb5c68bd268bf Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Thu, 6 Dec 2018 19:28:27 +0800 Subject: [PATCH 62/67] * fix processor.py * add code comments * merge *_saver.py & *_loader.py in io/ * (ancient codes) rename Loss into LossFromTorch --- fastNLP/api/model_zoo.py | 8 +- fastNLP/api/processor.py | 34 ++-- fastNLP/core/__init__.py | 2 +- fastNLP/core/dataset.py | 66 +++++--- fastNLP/core/losses.py | 55 ++++--- fastNLP/core/metrics.py | 7 + fastNLP/core/optimizer.py | 12 ++ fastNLP/core/trainer.py | 9 -- fastNLP/io/base_loader.py | 16 -- fastNLP/io/{config_saver.py => config_io.py} | 150 +++++++++++++++++- fastNLP/io/config_loader.py | 149 ----------------- fastNLP/io/dataset_loader.py | 126 +++------------ fastNLP/io/{model_saver.py => model_io.py} | 28 ++++ fastNLP/io/model_loader.py | 28 ---- reproduction/Biaffine_parser/infer.py | 2 +- reproduction/Biaffine_parser/run.py | 5 +- .../main.py | 4 +- reproduction/chinese_word_segment/run.py | 5 +- setup.py | 4 +- test/api/test_processor.py | 12 ++ test/core/test_loss.py | 10 +- test/io/test_config_saver.py | 3 +- 22 files changed, 349 insertions(+), 386 deletions(-) rename fastNLP/io/{config_saver.py => config_io.py} (52%) delete mode 100644 fastNLP/io/config_loader.py rename fastNLP/io/{model_saver.py => model_io.py} (51%) delete mode 100644 fastNLP/io/model_loader.py create mode 100644 test/api/test_processor.py diff --git a/fastNLP/api/model_zoo.py b/fastNLP/api/model_zoo.py index 9069ae55..a54a53d9 100644 --- a/fastNLP/api/model_zoo.py +++ b/fastNLP/api/model_zoo.py @@ -1,5 +1,3 @@ -import torch - import hashlib import os import re @@ -7,6 +5,8 @@ import sys import tempfile +import torch + try: from requests.utils import urlparse from requests import get as urlopen @@ -132,7 +132,3 @@ def __exit__(self, exc_type, exc_val, exc_tb): sys.stderr.write('\n') - -if __name__ == '__main__': - pipeline = load_url('http://10.141.208.102:5000/file/download/infer_context-4e86fd93.pkl', model_dir='.') - print(type(pipeline)) diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py index 711f2b67..d6a68412 100644 
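
# A minimal, illustrative sketch of the LossInForward path that the fix above
# exercises: the model computes its own loss inside forward() and returns it
# under the key LossInForward looks up (default: 'loss'). The toy model,
# shapes and data are assumptions for illustration only; the fastNLP calls
# follow the API exactly as it appears in this patch series.
import torch
import torch.nn as nn
import torch.nn.functional as F

from fastNLP.core.losses import LossInForward


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 2)

    def forward(self, x, y):
        logits = self.fc(x)
        # returning the loss under 'loss' is what lets LossInForward find it;
        # returning it under another key (e.g. 'losses') is what triggers the
        # improved missing-key message added in this commit
        return {'loss': F.cross_entropy(logits, y)}


model = ToyModel()
output = model(torch.randn(4, 10), torch.randint(0, 2, (4,)))
losser = LossInForward()            # loss_key defaults to 'loss'
loss = losser.get_loss(**output)    # with the key present, this should simply hand back output['loss']
print(loss)
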
--- a/fastNLP/api/processor.py +++ b/fastNLP/api/processor.py @@ -1,14 +1,15 @@ -import torch -from collections import defaultdict import re +from collections import defaultdict + +import torch -from fastNLP.core.dataset import DataSet -from fastNLP.core.vocabulary import Vocabulary from fastNLP.core.batch import Batch +from fastNLP.core.dataset import DataSet from fastNLP.core.sampler import SequentialSampler +from fastNLP.core.vocabulary import Vocabulary -class Processor: +class Processor(object): def __init__(self, field_name, new_added_field_name): self.field_name = field_name if new_added_field_name is None: @@ -17,7 +18,7 @@ def __init__(self, field_name, new_added_field_name): self.new_added_field_name = new_added_field_name def process(self, *args, **kwargs): - pass + raise NotImplementedError def __call__(self, *args, **kwargs): return self.process(*args, **kwargs) @@ -132,13 +133,14 @@ def process(self, dataset): class IndexerProcessor(Processor): - def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False): + def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True): assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab)) super(IndexerProcessor, self).__init__(field_name, new_added_field_name) self.vocab = vocab self.delete_old_field = delete_old_field + self.is_input = is_input def set_vocab(self, vocab): assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab)) @@ -146,13 +148,14 @@ def set_vocab(self, vocab): self.vocab = vocab def process(self, dataset): - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) + assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) for ins in dataset: tokens = ins[self.field_name] index = [self.vocab.to_index(token) for token in tokens] ins[self.new_added_field_name] = index - dataset._set_need_tensor(**{self.new_added_field_name: True}) + if self.is_input: + dataset.set_input(self.new_added_field_name) if self.delete_old_field: dataset.delete_field(self.field_name) @@ -161,6 +164,9 @@ def process(self, dataset): class VocabProcessor(Processor): + """Build vocabulary with a field in the data set. 
+ + """ def __init__(self, field_name): super(VocabProcessor, self).__init__(field_name, None) self.vocab = Vocabulary() @@ -178,17 +184,20 @@ def get_vocab(self): class SeqLenProcessor(Processor): - def __init__(self, field_name, new_added_field_name='seq_lens'): + def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True): super(SeqLenProcessor, self).__init__(field_name, new_added_field_name) + self.is_input = is_input def process(self, dataset): assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) for ins in dataset: length = len(ins[self.field_name]) ins[self.new_added_field_name] = length - dataset._set_need_tensor(**{self.new_added_field_name: True}) + if self.is_input: + dataset.set_input(self.new_added_field_name) return dataset + class ModelProcessor(Processor): def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32): """ @@ -238,6 +247,7 @@ def set_model_device(self, device): device = torch.device(device) self.model.to(device) + class Index2WordProcessor(Processor): def __init__(self, vocab, field_name, new_added_field_name): super(Index2WordProcessor, self).__init__(field_name, new_added_field_name) @@ -251,6 +261,7 @@ def process(self, dataset): class SetTensorProcessor(Processor): + # TODO: remove it. It is strange. def __init__(self, field_dict, default=False): super(SetTensorProcessor, self).__init__(None, None) self.field_dict = field_dict @@ -264,6 +275,7 @@ def process(self, dataset): class SetIsTargetProcessor(Processor): + # TODO; remove it. def __init__(self, field_dict, default=False): super(SetIsTargetProcessor, self).__init__(None, None) self.field_dict = field_dict diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index b16fe165..b62d5624 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -2,7 +2,7 @@ from .dataset import DataSet from .fieldarray import FieldArray from .instance import Instance -from .losses import Loss +from .losses import LossFromTorch from .optimizer import Optimizer from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler from .tester import Tester diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 57171e25..f4963d0a 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -9,32 +9,20 @@ _READERS = {} -def construct_dataset(sentences): - """Construct a data set from a list of sentences. - - :param sentences: list of list of str - :return dataset: a DataSet object - """ - dataset = DataSet() - for sentence in sentences: - instance = Instance() - instance['raw_sentence'] = sentence - dataset.append(instance) - return dataset - - class DataSet(object): """DataSet is the collection of examples. DataSet provides instance-level interface. You can append and access an instance of the DataSet. However, it stores data in a different way: Field-first, Instance-second. """ + def __init__(self, data=None): """ - :param data: a dict or a list. If it is a dict, the key is the name of a field and the value is the field. - All values must be of the same length. - If it is a list, it must be a list of Instance objects. + :param data: a dict or a list. + If `data` is a dict, the key is the name of a FieldArray and the value is the FieldArray. All values + must be of the same length. + If `data` is a list, it must be a list of Instance objects. 
""" self.field_arrays = {} if data is not None: @@ -60,6 +48,7 @@ def __iter__(self): def iter_func(): for idx in range(len(self)): yield self[idx] + return iter_func() def _inner_iter(self): @@ -69,7 +58,8 @@ def __init__(self, dataset, idx): self.idx = idx def __getitem__(self, item): - assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[self.idx]) + assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[ + self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] @@ -79,6 +69,7 @@ def __repr__(self): def inner_iter_func(): for idx in range(len(self)): yield Iter_ptr(self, idx) + return inner_iter_func() def __getitem__(self, idx): @@ -217,9 +208,17 @@ def set_input(self, *field_name, flag=True): raise KeyError("{} is not a valid field name.".format(name)) def get_input_name(self): + """Get all field names with `is_input` as True. + + :return list field_names: a list of str + """ return [name for name, field in self.field_arrays.items() if field.is_input] def get_target_name(self): + """Get all field names with `is_target` as True. + + :return list field_names: a list of str + """ return [name for name, field in self.field_arrays.items() if field.is_target] @classmethod @@ -243,7 +242,7 @@ def apply(self, func, new_field_name=None, **kwargs): :return results: if new_field_name is not passed, returned values of the function over all instances. """ results = [func(ins) for ins in self._inner_iter()] - if len(list(filter(lambda x: x is not None, results)))==0: # all None + if len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(get_func_signature(func=func))) extra_param = {} @@ -269,6 +268,12 @@ def apply(self, func, new_field_name=None, **kwargs): return results def drop(self, func): + """Drop instances if a condition holds. + + :param func: a function that takes an Instance object as input, and returns bool. + The instance will be dropped if the function returns True. + + """ results = [ins for ins in self._inner_iter() if not func(ins)] for name, old_field in self.field_arrays.items(): self.field_arrays[name].content = [ins[name] for ins in results] @@ -338,10 +343,33 @@ def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): return cls(_dict) def save(self, path): + """Save the DataSet object as pickle. + + :param str path: the path to the pickle + """ with open(path, 'wb') as f: pickle.dump(self, f) @staticmethod def load(path): + """Load a DataSet object from pickle. + + :param str path: the path to the pickle + :return DataSet data_set: + """ with open(path, 'rb') as f: return pickle.load(f) + + +def construct_dataset(sentences): + """Construct a data set from a list of sentences. 
+ + :param sentences: list of list of str + :return dataset: a DataSet object + """ + dataset = DataSet() + for sentence in sentences: + instance = Instance() + instance['raw_sentence'] = sentence + dataset.append(instance) + return dataset diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index fbd64e81..ed935c9d 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -7,14 +7,13 @@ from fastNLP.core.utils import CheckError from fastNLP.core.utils import CheckRes from fastNLP.core.utils import _build_args -from fastNLP.core.utils import _check_function_or_method from fastNLP.core.utils import _check_arg_dict_list +from fastNLP.core.utils import _check_function_or_method from fastNLP.core.utils import get_func_signature class LossBase(object): def __init__(self): - # key: name in target function; value: name in output function self.param_map = {} self._checked = False @@ -159,8 +158,18 @@ def __call__(self, pred_dict, target_dict, check=False): return loss + class LossFunc(LossBase): + """A wrapper of user-provided loss function. + + """ def __init__(self, func, key_map=None, **kwargs): + """ + + :param func: a callable object, such as a function. + :param dict key_map: + :param kwargs: + """ super(LossFunc, self).__init__() _check_function_or_method(func) if key_map is not None: @@ -254,19 +263,19 @@ def _prepare_losser(losser): def squash(predict, truth, **kwargs): - '''To reshape tensors in order to fit Loss functions in pytorch + """To reshape tensors in order to fit loss functions in pytorch :param predict : Tensor, model output :param truth : Tensor, truth from dataset :param **kwargs : extra arguments :return predict , truth: predict & truth after processing - ''' + """ return predict.view(-1, predict.size()[-1]), truth.view(-1, ) def unpad(predict, truth, **kwargs): - '''To process padded sequence output to get true loss + """To process padded sequence output to get true loss Using pack_padded_sequence() method This method contains squash() @@ -277,7 +286,7 @@ def unpad(predict, truth, **kwargs): the i-th element is true lengths of i-th sequence :return predict , truth: predict & truth after processing - ''' + """ if kwargs.get("lens") is None: return predict, truth lens = torch.LongTensor(kwargs["lens"]) @@ -288,7 +297,7 @@ def unpad(predict, truth, **kwargs): def unpad_mask(predict, truth, **kwargs): - '''To process padded sequence output to get true loss + """To process padded sequence output to get true loss Using mask() method This method contains squash() @@ -299,7 +308,7 @@ def unpad_mask(predict, truth, **kwargs): the i-th element is true lengths of i-th sequence :return predict , truth: predict & truth after processing - ''' + """ if kwargs.get("lens") is None: return predict, truth mas = make_mask(kwargs["lens"], truth.size()[1]) @@ -307,7 +316,7 @@ def unpad_mask(predict, truth, **kwargs): def mask(predict, truth, **kwargs): - '''To select specific elements from Tensor + """To select specific elements from Tensor This method contains squash() :param predict : Tensor, [batch_size , max_len , tag_size] @@ -317,7 +326,7 @@ def mask(predict, truth, **kwargs): the mask Tensor , the position that is 1 will be selected :return predict , truth: predict & truth after processing - ''' + """ if kwargs.get("mask") is None: return predict, truth mask = kwargs["mask"] @@ -332,14 +341,14 @@ def mask(predict, truth, **kwargs): def make_mask(lens, tar_len): - '''to generate a mask that select [:lens[i]] for i-th element + """to generate a mask that select 
[:lens[i]] for i-th element embezzle from fastNLP.models.sequence_modeling.seq_mask :param lens : list or LongTensor, [batch_size] :param tar_len : int :return mask : ByteTensor - ''' + """ lens = torch.LongTensor(lens) mask = [torch.ge(lens, i + 1) for i in range(tar_len)] mask = torch.stack(mask, 1) @@ -376,9 +385,11 @@ def make_mask(lens, tar_len): } -class Loss(object): - """a Loss object is a callable object represents loss functions +class LossFromTorch(object): + """a LossFromTorch object is a callable object represents loss functions + This class only helps you with loss functions from PyTorch. + It has nothing to do with Trainer. """ def __init__(self, loss_name, pre_pro=[squash], **kwargs): @@ -408,11 +419,11 @@ def __init__(self, loss_name, pre_pro=[squash], **kwargs): self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro] def add_pre_pro(self, func): - '''add a pre_pro function + """add a pre_pro function :param func: a function or str, methods to reform parameters before calculating loss the strings will be auto translated to pre-defined functions - ''' + """ if not callable(func): func = method_dict.get(func) if func is None: @@ -421,12 +432,12 @@ def add_pre_pro(self, func): @staticmethod def _get_loss(loss_name, **kwargs): - '''Get loss function from torch + """Get loss function from torch :param loss_name: str, the name of loss function :param **kwargs: kwargs for torch loss function :return: A callable loss function object - ''' + """ loss_name = loss_name.strip().lower() loss_name = "".join(loss_name.split("_")) @@ -435,19 +446,19 @@ def _get_loss(loss_name, **kwargs): return loss_function_name[loss_name](**kwargs) def get(self): - '''This method exists just for make some existing codes run error-freely - ''' + """This method exists just for make some existing codes run error-freely + """ return self def __call__(self, predict, truth, **kwargs): - '''call a loss function + """Call a loss function predict and truth will be processed by pre_pro methods in order of addition :param predict : Tensor, model output :param truth : Tensor, truth from dataset :param **kwargs : extra arguments, pass to pre_pro functions for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens - ''' + """ for f in self.pre_pro: if f is None: continue diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 32c2306f..929d6ee1 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -308,6 +308,13 @@ def _prepare_metrics(metrics): return _metrics +""" + Attention: Codes below are not used in current FastNLP. + However, it is useful. + +""" + + def _conver_numpy(x): """convert input data to numpy array diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 469c5632..dfcf83f9 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -11,6 +11,12 @@ def __init__(self, model_params, **kwargs): class SGD(Optimizer): def __init__(self, model_params=None, lr=0.01, momentum=0): + """ + + :param model_params: a generator. E.g. model.parameters() for PyTorch models. + :param float lr: learning rate. Default: 0.01 + :param float momentum: momentum. Default: 0 + """ super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) def construct_from_pytorch(self, model_params): @@ -23,6 +29,12 @@ def construct_from_pytorch(self, model_params): class Adam(Optimizer): def __init__(self, model_params=None, lr=0.01, weight_decay=0): + """ + + :param model_params: a generator. E.g. 
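
# An illustrative call pattern for the renamed LossFromTorch wrapper: it
# resolves a torch loss by name and runs the pre_pro helpers (squash/unpad/
# mask above) before computing the loss. The name 'cross_entropy' is a guess
# at a key registered in the internal loss_function_name table, which is not
# shown in this hunk; substitute whichever name the full losses.py defines.
import torch

from fastNLP.core.losses import LossFromTorch, unpad

predict = torch.randn(2, 5, 7)           # [batch_size, max_len, tag_size]
truth = torch.randint(0, 7, (2, 5))      # padded gold tags
lens = [5, 3]                            # true length of each sequence

loss_fn = LossFromTorch("cross_entropy", pre_pro=[unpad])
loss = loss_fn(predict, truth, lens=lens)   # unpad() consumes the `lens` kwarg
print(loss)
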
model.parameters() for PyTorch models. + :param float lr: learning rate + :param float weight_decay: + """ super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay) def construct_from_pytorch(self, model_params): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a3f81c00..c2bca3a2 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -140,7 +140,6 @@ def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch def train(self): """Start Training. - :return: """ try: if torch.cuda.is_available() and self.use_cuda: @@ -216,14 +215,6 @@ def _tqdm_train(self): pbar.close() def _print_train(self): - """ - - :param data_iterator: - :param model: - :param epoch: - :param start: - :return: - """ epoch = 1 start = time.time() while epoch <= self.n_epochs: diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index b67bc4ab..b0b0d864 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -29,19 +29,3 @@ def load_with_cache(cls, data_path, cache_path): with open(cache_path, 'wb') as f: pickle.dump(obj, f) return obj - - -class ToyLoader0(BaseLoader): - """ - For CharLM - """ - - def __init__(self, data_path): - super(ToyLoader0, self).__init__(data_path) - - def load(self): - with open(self.data_path, 'r') as f: - corpus = f.read().lower() - import re - corpus = re.sub(r"", "unk", corpus) - return corpus.split() diff --git a/fastNLP/io/config_saver.py b/fastNLP/io/config_io.py similarity index 52% rename from fastNLP/io/config_saver.py rename to fastNLP/io/config_io.py index 49d6804d..52c5e789 100644 --- a/fastNLP/io/config_saver.py +++ b/fastNLP/io/config_io.py @@ -1,6 +1,152 @@ +import configparser +import json import os -from fastNLP.io.config_loader import ConfigSection, ConfigLoader +from fastNLP.io.base_loader import BaseLoader + + +class ConfigLoader(BaseLoader): + """loader for configuration files""" + + def __init__(self, data_path=None): + super(ConfigLoader, self).__init__() + if data_path is not None: + self.config = self.parse(super(ConfigLoader, self).load(data_path)) + + @staticmethod + def parse(string): + raise NotImplementedError + + @staticmethod + def load_config(file_path, sections): + """ + :param file_path: the path of config file + :param sections: the dict of {section_name(string): Section instance} + Example: + test_args = ConfigSection() + ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) + :return: return nothing, but the value of attributes are saved in sessions + """ + assert isinstance(sections, dict) + cfg = configparser.ConfigParser() + if not os.path.exists(file_path): + raise FileNotFoundError("config file {} not found. ".format(file_path)) + cfg.read(file_path) + for s in sections: + attr_list = [i for i in sections[s].__dict__.keys() if + not callable(getattr(sections[s], i)) and not i.startswith("__")] + if s not in cfg: + print('section %s not found in config file' % (s)) + continue + gen_sec = cfg[s] + for attr in gen_sec.keys(): + try: + val = json.loads(gen_sec[attr]) + # print(s, attr, val, type(val)) + if attr in attr_list: + assert type(val) == type(getattr(sections[s], attr)), \ + 'type not match, except %s but got %s' % \ + (type(getattr(sections[s], attr)), type(val)) + """ + if attr in attr_list then check its type and + update its value. 
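
# The two optimizer wrappers documented above, sketched on a toy model.
# Passing model.parameters() at construction follows the signatures in the
# diff; construct_from_pytorch() appears there as the hook that builds the
# underlying torch.optim object once parameters are available, so treating
# its return value as a torch optimizer is an assumption.
import torch.nn as nn

from fastNLP.core.optimizer import SGD, Adam

model = nn.Linear(10, 2)

sgd = SGD(model.parameters(), lr=0.01, momentum=0.9)
adam = Adam(lr=0.001, weight_decay=1e-4)                  # parameters supplied later
torch_adam = adam.construct_from_pytorch(model.parameters())
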
+ else add a new attr in sections[s] + """ + setattr(sections[s], attr, val) + except Exception as e: + print("cannot load attribute %s in section %s" + % (attr, s)) + pass + + +class ConfigSection(object): + + def __init__(self): + pass + + def __getitem__(self, key): + """ + :param key: str, the name of the attribute + :return attr: the value of this attribute + if key not in self.__dict__.keys(): + return self[key] + else: + raise AttributeError + """ + if key in self.__dict__.keys(): + return getattr(self, key) + raise AttributeError("do NOT have attribute %s" % key) + + def __setitem__(self, key, value): + """ + :param key: str, the name of the attribute + :param value: the value of this attribute + if key not in self.__dict__.keys(): + self[key] will be added + else: + self[key] will be updated + """ + if key in self.__dict__.keys(): + if not isinstance(value, type(getattr(self, key))): + raise AttributeError("attr %s except %s but got %s" % + (key, str(type(getattr(self, key))), str(type(value)))) + setattr(self, key, value) + + def __contains__(self, item): + """ + :param item: The key of item. + :return: True if the key in self.__dict__.keys() else False. + """ + return item in self.__dict__.keys() + + def __eq__(self, other): + """Overwrite the == operator + + :param other: Another ConfigSection() object which to be compared. + :return: True if value of each key in each ConfigSection() object are equal to the other, else False. + """ + for k in self.__dict__.keys(): + if k not in other.__dict__.keys(): + return False + if getattr(self, k) != getattr(self, k): + return False + + for k in other.__dict__.keys(): + if k not in self.__dict__.keys(): + return False + if getattr(self, k) != getattr(self, k): + return False + + return True + + def __ne__(self, other): + """Overwrite the != operator + + :param other: + :return: + """ + return not self.__eq__(other) + + @property + def data(self): + return self.__dict__ + + +if __name__ == "__main__": + config = ConfigLoader('there is no data') + + section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} + """ + General and My can be found in config file, so the attr and + value will be updated + A cannot be found in config file, so nothing will be done + """ + + config.load_config("../../test/data_for_tests/config", section) + for s in section: + print(s) + for attr in section[s].__dict__.keys(): + print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr))) class ConfigSaver(object): @@ -125,7 +271,7 @@ def save_config_file(self, section_name, section): # logger = create_logger(__name__, "./config_loader.log") # logger.warning("section [%s] in config file [%s] has been changed" % ( # section_name, self.file_path - #)) + # )) change_file = True break if not change_file: diff --git a/fastNLP/io/config_loader.py b/fastNLP/io/config_loader.py deleted file mode 100644 index 66051e4d..00000000 --- a/fastNLP/io/config_loader.py +++ /dev/null @@ -1,149 +0,0 @@ -import configparser -import json -import os - -from fastNLP.io.base_loader import BaseLoader - - -class ConfigLoader(BaseLoader): - """loader for configuration files""" - - def __init__(self, data_path=None): - super(ConfigLoader, self).__init__() - if data_path is not None: - self.config = self.parse(super(ConfigLoader, self).load(data_path)) - - @staticmethod - def parse(string): - raise NotImplementedError - - @staticmethod - def load_config(file_path, sections): - """ - :param file_path: the path of config file - :param sections: the dict of 
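
# How the merged config_io module is used, following the load_config()
# docstring and the demo at the bottom of the new file. The path and the
# section name 'POS_test' are placeholders for a real .cfg file on disk.
from fastNLP.io.config_io import ConfigLoader, ConfigSection

test_args = ConfigSection()
# load_config is a staticmethod; it fills the ConfigSection objects in place
ConfigLoader.load_config("./data_for_tests/config", {"POS_test": test_args})

print("epochs" in test_args)   # ConfigSection supports `in` (the key is illustrative)
print(test_args.data)          # the parsed key/value pairs as a dict
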
{section_name(string): Section instance} - Example: - test_args = ConfigSection() - ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - :return: return nothing, but the value of attributes are saved in sessions - """ - assert isinstance(sections, dict) - cfg = configparser.ConfigParser() - if not os.path.exists(file_path): - raise FileNotFoundError("config file {} not found. ".format(file_path)) - cfg.read(file_path) - for s in sections: - attr_list = [i for i in sections[s].__dict__.keys() if - not callable(getattr(sections[s], i)) and not i.startswith("__")] - if s not in cfg: - print('section %s not found in config file' % (s)) - continue - gen_sec = cfg[s] - for attr in gen_sec.keys(): - try: - val = json.loads(gen_sec[attr]) - # print(s, attr, val, type(val)) - if attr in attr_list: - assert type(val) == type(getattr(sections[s], attr)), \ - 'type not match, except %s but got %s' % \ - (type(getattr(sections[s], attr)), type(val)) - """ - if attr in attr_list then check its type and - update its value. - else add a new attr in sections[s] - """ - setattr(sections[s], attr, val) - except Exception as e: - print("cannot load attribute %s in section %s" - % (attr, s)) - pass - - -class ConfigSection(object): - - def __init__(self): - pass - - def __getitem__(self, key): - """ - :param key: str, the name of the attribute - :return attr: the value of this attribute - if key not in self.__dict__.keys(): - return self[key] - else: - raise AttributeError - """ - if key in self.__dict__.keys(): - return getattr(self, key) - raise AttributeError("do NOT have attribute %s" % key) - - def __setitem__(self, key, value): - """ - :param key: str, the name of the attribute - :param value: the value of this attribute - if key not in self.__dict__.keys(): - self[key] will be added - else: - self[key] will be updated - """ - if key in self.__dict__.keys(): - if not isinstance(value, type(getattr(self, key))): - raise AttributeError("attr %s except %s but got %s" % - (key, str(type(getattr(self, key))), str(type(value)))) - setattr(self, key, value) - - def __contains__(self, item): - """ - :param item: The key of item. - :return: True if the key in self.__dict__.keys() else False. - """ - return item in self.__dict__.keys() - - def __eq__(self, other): - """Overwrite the == operator - - :param other: Another ConfigSection() object which to be compared. - :return: True if value of each key in each ConfigSection() object are equal to the other, else False. 
- """ - for k in self.__dict__.keys(): - if k not in other.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - for k in other.__dict__.keys(): - if k not in self.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - return True - - def __ne__(self, other): - """Overwrite the != operator - - :param other: - :return: - """ - return not self.__eq__(other) - - @property - def data(self): - return self.__dict__ - - -if __name__ == "__main__": - config = ConfigLoader('there is no data') - - section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} - """ - General and My can be found in config file, so the attr and - value will be updated - A cannot be found in config file, so nothing will be done - """ - - config.load_config("../../test/data_for_tests/config", section) - for s in section: - print(s) - for attr in section[s].__dict__.keys(): - print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr))) diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 79cb30ad..fc2edb23 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,4 +1,3 @@ -#TODO: need fix for current DataSet import os from fastNLP.core.dataset import DataSet @@ -20,8 +19,7 @@ def convert_seq_dataset(data): """ dataset = DataSet() for word_seq in data: - x = TextField(word_seq, is_target=False) - dataset.append(Instance(word_seq=x)) + dataset.append(Instance(word_seq=word_seq)) return dataset @@ -40,11 +38,7 @@ def convert_seq2tag_dataset(data): """ dataset = DataSet() for sample in data: - word_seq, label = sample[0], sample[1] - ins = Instance() - ins.add_field("word_seq", TextField(word_seq, is_target=False)) \ - .add_field("label", LabelField(label, is_target=True)) - dataset.append(ins) + dataset.append(Instance(word_seq=sample[0], label=sample[1])) return dataset @@ -63,11 +57,7 @@ def convert_seq2seq_dataset(data): """ dataset = DataSet() for sample in data: - word_seq, label_seq = sample[0], sample[1] - ins = Instance() - ins.add_field("word_seq", TextField(word_seq, is_target=False)) \ - .add_field("label_seq", TextField(label_seq, is_target=True)) - dataset.append(ins) + dataset.append(Instance(word_seq=sample[0], label_seq=sample[1])) return dataset @@ -273,85 +263,6 @@ def convert(self, data): return convert_seq2tag_dataset(data) -@DataSet.set_reader('read_conll') -class ConllLoader(DataSetLoader): - """loader for conll format files""" - - def __init__(self): - """ - :param str data_path: the path to the conll data set - """ - super(ConllLoader, self).__init__() - - def load(self, data_path): - """ - :return: list lines: all lines in a conll file - """ - with open(data_path, "r", encoding="utf-8") as f: - lines = f.readlines() - data = self.parse(lines) - return self.convert(data) - - @staticmethod - def parse(lines): - """ - :param list lines:a list containing all lines in a conll file. - :return: a 3D list - """ - sentences = list() - tokens = list() - for line in lines: - if line[0] == "#": - # skip the comments - continue - if line == "\n": - sentences.append(tokens) - tokens = [] - continue - tokens.append(line.split()) - return sentences - - def convert(self, data): - pass - - -@DataSet.set_reader('read_lm') -class LMDataSetLoader(DataSetLoader): - """Language Model Dataset Loader - - This loader produces data for language model training in a supervised way. - That means it has X and Y. 
- - """ - - def __init__(self): - super(LMDataSetLoader, self).__init__() - - def load(self, data_path): - if not os.path.exists(data_path): - raise FileNotFoundError("file {} not found.".format(data_path)) - with open(data_path, "r", encoding="utf=8") as f: - text = " ".join(f.readlines()) - tokens = text.strip().split() - data = self.sentence_cut(tokens) - return self.convert(data) - - def sentence_cut(self, tokens, sentence_length=15): - start_idx = 0 - data_set = [] - for idx in range(len(tokens) // sentence_length): - x = tokens[start_idx * idx: start_idx * idx + sentence_length] - y = tokens[start_idx * idx + 1: start_idx * idx + sentence_length + 1] - if start_idx * idx + sentence_length + 1 >= len(tokens): - # ad hoc - y.extend([""]) - data_set.append([x, y]) - return data_set - - def convert(self, data): - pass - - @DataSet.set_reader('read_people_daily') class PeopleDailyCorpusLoader(DataSetLoader): """ @@ -403,10 +314,19 @@ def load(self, data_path): pos_tag_examples.append([sent_words, sent_pos_tag]) ner_examples.append([sent_words, sent_ner]) # List[List[List[str], List[str]]] - return pos_tag_examples, ner_examples + # ner_examples not used + return self.convert(pos_tag_examples) def convert(self, data): - pass + data_set = DataSet() + for item in data: + sent_words, sent_pos_tag = item[0], item[1] + data_set.append(Instance(words=sent_words, tags=sent_pos_tag)) + data_set.apply(lambda ins: len(ins), new_field_name="seq_len") + data_set.set_target("tags") + data_set.set_input("sent_words") + data_set.set_input("seq_len") + return data_set class SNLIDataSetLoader(DataSetLoader): @@ -462,17 +382,13 @@ def convert(self, data): for example in data: p, h, l = example # list, list, str - x1 = TextField(p, is_target=False) - x2 = TextField(h, is_target=False) - x1_len = TextField([1] * len(p), is_target=False) - x2_len = TextField([1] * len(h), is_target=False) - y = LabelField(l, is_target=True) instance = Instance() - instance.add_field("premise", x1) - instance.add_field("hypothesis", x2) - instance.add_field("premise_len", x1_len) - instance.add_field("hypothesis_len", x2_len) - instance.add_field("truth", y) + instance.add_field("premise", p) + instance.add_field("hypothesis", h) + instance.add_field("truth", l) data_set.append(instance) - + data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len") + data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len") + data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len") + data_set.set_target("truth") return data_set diff --git a/fastNLP/io/model_saver.py b/fastNLP/io/model_io.py similarity index 51% rename from fastNLP/io/model_saver.py rename to fastNLP/io/model_io.py index fd391f69..e1264b47 100644 --- a/fastNLP/io/model_saver.py +++ b/fastNLP/io/model_io.py @@ -1,5 +1,32 @@ import torch +from fastNLP.io.base_loader import BaseLoader + + +class ModelLoader(BaseLoader): + """ + Loader for models. + """ + + def __init__(self): + super(ModelLoader, self).__init__() + + @staticmethod + def load_pytorch(empty_model, model_path): + """ + Load model parameters from .pkl files into the empty PyTorch model. + :param empty_model: a PyTorch model with initialized parameters. + :param model_path: str, the path to the saved model. + """ + empty_model.load_state_dict(torch.load(model_path)) + + @staticmethod + def load_pytorch_model(model_path): + """Load the entire model. 
+ + """ + return torch.load(model_path) + class ModelSaver(object): """Save a model @@ -8,6 +35,7 @@ class ModelSaver(object): saver.save_pytorch(model) """ + def __init__(self, save_path): """ diff --git a/fastNLP/io/model_loader.py b/fastNLP/io/model_loader.py deleted file mode 100644 index afa05b93..00000000 --- a/fastNLP/io/model_loader.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch - -from fastNLP.io.base_loader import BaseLoader - - -class ModelLoader(BaseLoader): - """ - Loader for models. - """ - - def __init__(self): - super(ModelLoader, self).__init__() - - @staticmethod - def load_pytorch(empty_model, model_path): - """ - Load model parameters from .pkl files into the empty PyTorch model. - :param empty_model: a PyTorch model with initialized parameters. - :param model_path: str, the path to the saved model. - """ - empty_model.load_state_dict(torch.load(model_path)) - - @staticmethod - def load_pytorch_model(model_path): - """Load the entire model. - - """ - return torch.load(model_path) diff --git a/reproduction/Biaffine_parser/infer.py b/reproduction/Biaffine_parser/infer.py index 7d05c62b..8ebfa91c 100644 --- a/reproduction/Biaffine_parser/infer.py +++ b/reproduction/Biaffine_parser/infer.py @@ -5,7 +5,7 @@ from fastNLP.api.processor import * from fastNLP.models.biaffine_parser import BiaffineParser -from fastNLP.io.config_loader import ConfigSection, ConfigLoader +from fastNLP.io.config_io import ConfigSection, ConfigLoader import _pickle as pickle import torch diff --git a/reproduction/Biaffine_parser/run.py b/reproduction/Biaffine_parser/run.py index 15dd3d4f..0519201a 100644 --- a/reproduction/Biaffine_parser/run.py +++ b/reproduction/Biaffine_parser/run.py @@ -13,11 +13,10 @@ from fastNLP.core.dataset import DataSet from fastNLP.core.field import TextField, SeqLabelField from fastNLP.core.tester import Tester -from fastNLP.io.config_loader import ConfigLoader, ConfigSection -from fastNLP.io.model_loader import ModelLoader +from fastNLP.io.config_io import ConfigLoader, ConfigSection +from fastNLP.io.model_io import ModelLoader, ModelSaver from fastNLP.io.embed_loader import EmbedLoader from fastNLP.models.biaffine_parser import BiaffineParser -from fastNLP.io.model_saver import ModelSaver BOS = '' EOS = '' diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/main.py b/reproduction/LSTM+self_attention_sentiment_analysis/main.py index 2a64c8d3..61ab79f4 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/main.py +++ b/reproduction/LSTM+self_attention_sentiment_analysis/main.py @@ -2,8 +2,8 @@ from fastNLP.core.trainer import ClassificationTrainer from fastNLP.core.utils import ClassPreprocess as Preprocess -from fastNLP.io.config_loader import ConfigLoader -from fastNLP.io.config_loader import ConfigSection +from fastNLP.io.config_io import ConfigLoader +from fastNLP.io.config_io import ConfigSection from fastNLP.io.dataset_loader import ClassDataSetLoader as Dataset_loader from fastNLP.models.base_model import BaseModel from fastNLP.modules.aggregator.self_attention import SelfAttention diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index 7dd5091a..e7804bae 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -3,12 +3,11 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) -from fastNLP.io.config_loader import ConfigLoader, ConfigSection +from fastNLP.io.config_io import ConfigLoader, ConfigSection from fastNLP.core.trainer import 
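With model_loader.py and model_saver.py merged into fastNLP/io/model_io.py, saving and loading now come from a single import, which is what the updated scripts above assume. A sketch of a round trip; the checkpoint name is made up, and whether load_pytorch or load_pytorch_model applies depends on how the checkpoint was written:

    import torch
    from fastNLP.io.model_io import ModelLoader, ModelSaver

    model = torch.nn.Linear(10, 2)            # any PyTorch model

    saver = ModelSaver("./model_ckpt.pkl")
    saver.save_pytorch(model)

    # torch.load the whole file ...
    obj = ModelLoader.load_pytorch_model("./model_ckpt.pkl")
    # ... or, if the file holds a state dict, fill an already-built model instead:
    # ModelLoader.load_pytorch(torch.nn.Linear(10, 2), "./model_ckpt.pkl")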
SeqLabelTrainer from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader from fastNLP.core.utils import load_pickle -from fastNLP.io.model_saver import ModelSaver -from fastNLP.io.model_loader import ModelLoader +from fastNLP.io.model_io import ModelLoader, ModelSaver from fastNLP.core.tester import SeqLabelTester from fastNLP.models.sequence_modeling import AdvSeqLabel from fastNLP.core.predictor import SeqLabelInfer diff --git a/setup.py b/setup.py index 0da887a3..a8b4834e 100644 --- a/setup.py +++ b/setup.py @@ -12,12 +12,12 @@ reqs = f.read() setup( - name='fastNLP', + name='FastNLP', version='0.1.1', description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', long_description=readme, license=license, - author='fudanNLP', + author='FudanNLP', python_requires='>=3.5', packages=find_packages(), install_requires=reqs.strip().split('\n'), diff --git a/test/api/test_processor.py b/test/api/test_processor.py new file mode 100644 index 00000000..fa6133b9 --- /dev/null +++ b/test/api/test_processor.py @@ -0,0 +1,12 @@ +import unittest + +from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor +from fastNLP.core.dataset import DataSet + + +class TestProcessor(unittest.TestCase): + def test_FullSpaceToHalfSpaceProcessor(self): + ds = DataSet({"word": ["00, u1, u), (u2, u2"]}) + proc = FullSpaceToHalfSpaceProcessor("word") + ds = proc(ds) + self.assertTrue(ds.field_arrays["word"].content, ["00, u1, u), (u2, u2"]) diff --git a/test/core/test_loss.py b/test/core/test_loss.py index a7c303e2..52860b36 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -45,7 +45,7 @@ def test_case_2(self): # 验证squash()的正确性 log = math.log - loss_func = loss.Loss("nll") + loss_func = loss.LossFromTorch("nll") y = tc.Tensor( [ @@ -129,7 +129,7 @@ def test_case_4(self): lens = [4, 2, 1] y = tc.log(y) - loss_func = loss.Loss("nll", pre_pro=["unpad"]) + loss_func = loss.LossFromTorch("nll", pre_pro=["unpad"]) los = loss_func(y, gy, lens=lens) r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) @@ -169,7 +169,7 @@ def test_case_5(self): lens = [2, 4, 2] - loss_func = loss.Loss("nll", pre_pro=["mask"]) + loss_func = loss.LossFromTorch("nll", pre_pro=["mask"]) los = loss_func(y, gy, mask=mask) los2 = loss_func(y, gy, mask=loss.make_mask(lens, gy.size()[-1])) @@ -205,7 +205,7 @@ def test_case_6(self): y = tc.log(y) - loss_func = loss.Loss("nll", pre_pro=["unpad_mask"]) + loss_func = loss.LossFromTorch("nll", pre_pro=["unpad_mask"]) los = loss_func(y, gy, lens=lens) r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) @@ -235,7 +235,7 @@ def test_case_7(self): lens = [4, 2, 1] y = tc.log(y) - loss_func = loss.Loss("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0])) + loss_func = loss.LossFromTorch("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0])) loss_func.add_pre_pro("unpad_mask") los = loss_func(y, gy, lens=lens) diff --git a/test/io/test_config_saver.py b/test/io/test_config_saver.py index 4a223f91..f29097c5 100644 --- a/test/io/test_config_saver.py +++ b/test/io/test_config_saver.py @@ -1,8 +1,7 @@ import os import unittest -from fastNLP.io.config_loader import ConfigSection, ConfigLoader -from fastNLP.io.config_saver import ConfigSaver +from fastNLP.io.config_io import ConfigSection, ConfigLoader, ConfigSaver class TestConfigSaver(unittest.TestCase): From 72877c6ed5b8011ad367eff42178594f53dd87df Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 7 Dec 2018 13:31:52 +0800 Subject: [PATCH 63/67] 
=?UTF-8?q?optimizer=E5=88=9D=E5=A7=8B=E5=8C=96?= =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0=E9=A1=BA=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/__init__.py | 7 ++++--- fastNLP/core/optimizer.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index b62d5624..44f30fad 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -2,9 +2,10 @@ from .dataset import DataSet from .fieldarray import FieldArray from .instance import Instance -from .losses import LossFromTorch -from .optimizer import Optimizer +from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward +from .metrics import AccuracyMetric +from .optimizer import Optimizer, SGD, Adam from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler from .tester import Tester from .trainer import Trainer -from .vocabulary import Vocabulary +from .vocabulary import Vocabulary \ No newline at end of file diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index dfcf83f9..f123ae40 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -10,13 +10,15 @@ def __init__(self, model_params, **kwargs): class SGD(Optimizer): - def __init__(self, model_params=None, lr=0.01, momentum=0): + def __init__(self, lr=0.01, momentum=0, model_params=None): """ - :param model_params: a generator. E.g. model.parameters() for PyTorch models. :param float lr: learning rate. Default: 0.01 :param float momentum: momentum. Default: 0 + :param model_params: a generator. E.g. model.parameters() for PyTorch models. """ + if not isinstance(lr, float): + raise TypeError("learning rate has to be float.") super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) def construct_from_pytorch(self, model_params): @@ -28,13 +30,15 @@ def construct_from_pytorch(self, model_params): class Adam(Optimizer): - def __init__(self, model_params=None, lr=0.01, weight_decay=0): + def __init__(self, lr=0.01, weight_decay=0, model_params=None): """ - :param model_params: a generator. E.g. model.parameters() for PyTorch models. :param float lr: learning rate :param float weight_decay: + :param model_params: a generator. E.g. model.parameters() for PyTorch models. 
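Under the reordered signatures, an optimizer can be built from hyper-parameters alone and bound to parameters later; note that lr now has to be a Python float. A usage sketch, where the model is a stand-in and the return value of construct_from_pytorch is assumed to be the wrapped torch.optim object:

    import torch
    from fastNLP.core.optimizer import SGD, Adam

    model = torch.nn.Linear(10, 2)

    opt = SGD(lr=0.01, momentum=0.9)          # no model_params needed up front
    torch_opt = opt.construct_from_pytorch(model.parameters())

    adam = Adam(lr=1e-3, weight_decay=0.0, model_params=model.parameters())

    # SGD(lr=1) raises TypeError: the learning rate has to be a float, not an int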
""" + if not isinstance(lr, float): + raise TypeError("learning rate has to be float.") super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay) def construct_from_pytorch(self, model_params): From 447746d9f556d3052ca96400b1b538b545f04220 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Fri, 7 Dec 2018 13:22:04 +0800 Subject: [PATCH 64/67] * remove unused codes in losses.py & metrics.py * refine code style * fix tests * add a new tutorial --- fastNLP/core/losses.py | 115 +----------- fastNLP/core/metrics.py | 183 +------------------ fastNLP/io/dataset_loader.py | 12 ++ test/core/test_loss.py | 260 ++------------------------- test/core/test_metrics.py | 6 +- tutorials/fastnlp_in_six_lines.ipynb | 81 +++++++++ 6 files changed, 119 insertions(+), 538 deletions(-) create mode 100644 tutorials/fastnlp_in_six_lines.ipynb diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index ed935c9d..757ce465 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -195,6 +195,7 @@ def get_loss(self, pred, target): return F.cross_entropy(input=pred, target=target, ignore_index=self.padding_idx) + class L1Loss(LossBase): def __init__(self, pred=None, target=None): super(L1Loss, self).__init__() @@ -212,6 +213,7 @@ def __init__(self, pred=None, target=None): def get_loss(self, pred, target): return F.binary_cross_entropy(input=pred, target=target) + class NLLLoss(LossBase): def __init__(self, pred=None, target=None): super(NLLLoss, self).__init__() @@ -259,7 +261,7 @@ def _prepare_losser(losser): elif isinstance(losser, LossBase): return losser else: - raise TypeError(f"Type of losser should be `fastNLP.LossBase`, got {type(losser)}") + raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}") def squash(predict, truth, **kwargs): @@ -354,114 +356,3 @@ def make_mask(lens, tar_len): mask = torch.stack(mask, 1) return mask - -# map string to function. Just for more elegant using -method_dict = { - "squash": squash, - "unpad": unpad, - "unpad_mask": unpad_mask, - "mask": mask, -} - -loss_function_name = { - "L1Loss".lower(): torch.nn.L1Loss, - "BCELoss".lower(): torch.nn.BCELoss, - "MSELoss".lower(): torch.nn.MSELoss, - "NLLLoss".lower(): torch.nn.NLLLoss, - "KLDivLoss".lower(): torch.nn.KLDivLoss, - "NLLLoss2dLoss".lower(): torch.nn.NLLLoss2d, # every name should end with "loss" - "SmoothL1Loss".lower(): torch.nn.SmoothL1Loss, - "SoftMarginLoss".lower(): torch.nn.SoftMarginLoss, - "PoissonNLLLoss".lower(): torch.nn.PoissonNLLLoss, - "MultiMarginLoss".lower(): torch.nn.MultiMarginLoss, - "CrossEntropyLoss".lower(): torch.nn.CrossEntropyLoss, - "BCEWithLogitsLoss".lower(): torch.nn.BCEWithLogitsLoss, - "MarginRankingLoss".lower(): torch.nn.MarginRankingLoss, - "TripletMarginLoss".lower(): torch.nn.TripletMarginLoss, - "HingeEmbeddingLoss".lower(): torch.nn.HingeEmbeddingLoss, - "CosineEmbeddingLoss".lower(): torch.nn.CosineEmbeddingLoss, - "MultiLabelMarginLoss".lower(): torch.nn.MultiLabelMarginLoss, - "MultiLabelSoftMarginLoss".lower(): torch.nn.MultiLabelSoftMarginLoss, -} - - -class LossFromTorch(object): - """a LossFromTorch object is a callable object represents loss functions - - This class only helps you with loss functions from PyTorch. - It has nothing to do with Trainer. 
- """ - - def __init__(self, loss_name, pre_pro=[squash], **kwargs): - """ - - :param loss_name: str or None , the name of loss function - :param pre_pro : list of function or str, methods to reform parameters before calculating loss - the strings will be auto translated to pre-defined functions - :param **kwargs: kwargs for torch loss function - - pre_pro funcsions should have three arguments: predict, truth, **arg - predict and truth is the necessary parameters in loss function - kwargs is the extra parameters passed-in when calling loss function - pre_pro functions should return two objects, respectively predict and truth that after processed - - """ - - if loss_name is None: - # this is useful when Trainer.__init__ performs type check - self._loss = None - else: - if not isinstance(loss_name, str): - raise NotImplementedError - else: - self._loss = self._get_loss(loss_name, **kwargs) - - self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro] - - def add_pre_pro(self, func): - """add a pre_pro function - - :param func: a function or str, methods to reform parameters before calculating loss - the strings will be auto translated to pre-defined functions - """ - if not callable(func): - func = method_dict.get(func) - if func is None: - return - self.pre_pro.append(func) - - @staticmethod - def _get_loss(loss_name, **kwargs): - """Get loss function from torch - - :param loss_name: str, the name of loss function - :param **kwargs: kwargs for torch loss function - :return: A callable loss function object - """ - loss_name = loss_name.strip().lower() - loss_name = "".join(loss_name.split("_")) - - if len(loss_name) < 4 or loss_name[-4:] != "loss": - loss_name += "loss" - return loss_function_name[loss_name](**kwargs) - - def get(self): - """This method exists just for make some existing codes run error-freely - """ - return self - - def __call__(self, predict, truth, **kwargs): - """Call a loss function - predict and truth will be processed by pre_pro methods in order of addition - - :param predict : Tensor, model output - :param truth : Tensor, truth from dataset - :param **kwargs : extra arguments, pass to pre_pro functions - for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens - """ - for f in self.pre_pro: - if f is None: - continue - predict, truth = f(predict, truth, **kwargs) - - return self._loss(predict, truth) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 929d6ee1..34a90d5a 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -1,5 +1,4 @@ import inspect -import warnings from collections import defaultdict import numpy as np @@ -197,19 +196,19 @@ def _fast_param_map(self, pred_dict, target_dict): """ fast_param = {} targets = list(target_dict.values()) - if len(targets)==1 and isinstance(targets[0], torch.Tensor): - if len(pred_dict)==1: + if len(targets) == 1 and isinstance(targets[0], torch.Tensor): + if len(pred_dict) == 1: pred = list(pred_dict.values())[0] fast_param['pred'] = pred - elif len(pred_dict)==2: + elif len(pred_dict) == 2: pred1 = list(pred_dict.values())[0] pred2 = list(pred_dict.values())[1] if not (isinstance(pred1, torch.Tensor) and isinstance(pred2, torch.Tensor)): return fast_param - if len(pred1.size())len(pred2.size()) and len(pred2.size())==1: + elif len(pred1.size()) > len(pred2.size()) and len(pred2.size()) == 1: seq_lens = pred2 pred = pred1 else: @@ -308,178 +307,6 @@ def _prepare_metrics(metrics): return _metrics -""" - Attention: Codes below are not used in current 
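The loss classes that remain (CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward) replace the LossFromTorch wrapper deleted above: they map keys of the model's output dict and of the dataset's target fields onto the underlying torch call. A sketch with illustrative key names, following the call convention used in the updated tests:

    import torch
    from fastNLP.core.losses import CrossEntropyLoss

    loss = CrossEntropyLoss(pred="pred", target="label_seq")

    pred_dict = {"pred": torch.randn(16, 5)}              # what the model's forward returned
    target_dict = {"label_seq": torch.zeros(16).long()}   # golden labels from the DataSet
    value = loss(pred_dict=pred_dict, target_dict=target_dict)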
FastNLP. - However, it is useful. - -""" - - -def _conver_numpy(x): - """convert input data to numpy array - - """ - if isinstance(x, np.ndarray): - return x - elif isinstance(x, torch.Tensor): - return x.numpy() - elif isinstance(x, list): - return np.array(x) - raise TypeError('cannot accept object: {}'.format(x)) - - -def _check_same_len(*arrays, axis=0): - """check if input array list has same length for one dimension - - """ - lens = set([x.shape[axis] for x in arrays if x is not None]) - return len(lens) == 1 - - -def _label_types(y): - """Determine the type - - "binary" - - "multiclass" - - "multiclass-multioutput" - - "multilabel" - - "unknown" - """ - # never squeeze the first dimension - y = y.squeeze() if y.shape[0] > 1 else y.resize(1, -1) - shape = y.shape - if len(shape) < 1: - raise ValueError('cannot accept data: {}'.format(y)) - if len(shape) == 1: - return 'multiclass' if np.unique(y).shape[0] > 2 else 'binary', y - if len(shape) == 2: - return 'multiclass-multioutput' if np.unique(y).shape[0] > 2 else 'multilabel', y - return 'unknown', y - - -def _check_data(y_true, y_pred): - """Check if y_true and y_pred is same type of data e.g both binary or multiclass - - """ - y_true, y_pred = _conver_numpy(y_true), _conver_numpy(y_pred) - if not _check_same_len(y_true, y_pred): - raise ValueError('cannot accept data with different shape {0}, {1}'.format(y_true, y_pred)) - type_true, y_true = _label_types(y_true) - type_pred, y_pred = _label_types(y_pred) - - type_set = {'binary', 'multiclass'} - if type_true in type_set and type_pred in type_set: - return type_true if type_true == type_pred else 'multiclass', y_true, y_pred - - type_set = {'multiclass-multioutput', 'multilabel'} - if type_true in type_set and type_pred in type_set: - return type_true if type_true == type_pred else 'multiclass-multioutput', y_true, y_pred - - raise ValueError('cannot accept data mixed of {0} and {1} target'.format(type_true, type_pred)) - - -def _weight_sum(y, normalize=True, sample_weight=None): - if normalize: - return np.average(y, weights=sample_weight) - if sample_weight is None: - return y.sum() - else: - return np.dot(y, sample_weight) - - -def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): - y_type, y_true, y_pred = _check_data(y_true, y_pred) - if y_type == 'multiclass-multioutput': - raise ValueError('cannot accept data type {0}'.format(y_type)) - if y_type == 'multilabel': - equel = (y_true == y_pred).sum(1) - count = equel == y_true.shape[1] - else: - count = y_true == y_pred - return _weight_sum(count, normalize=normalize, sample_weight=sample_weight) - - -def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary'): - y_type, y_true, y_pred = _check_data(y_true, y_pred) - if average == 'binary': - if y_type != 'binary': - raise ValueError("data type is {} but use average type {}".format(y_type, average)) - else: - pos = (y_true == pos_label) - tp = np.logical_and((y_true == y_pred), pos).sum() - pos_sum = pos.sum() - return tp / pos_sum if pos_sum > 0 else 0 - elif average == None: - y_labels = set(list(np.unique(y_true))) - if labels is None: - labels = list(y_labels) - else: - for i in labels: - if (i not in y_labels and y_type != 'multilabel') or (y_type == 'multilabel' and i >= y_true.shape[1]): - warnings.warn('label {} is not contained in data'.format(i), UserWarning) - - if y_type in ['binary', 'multiclass']: - y_pred_right = y_true == y_pred - pos_list = [y_true == i for i in labels] - pos_sum_list = [pos_i.sum() for pos_i in pos_list] - 
return np.array([np.logical_and(y_pred_right, pos_i).sum() / sum_i if sum_i > 0 else 0 \ - for pos_i, sum_i in zip(pos_list, pos_sum_list)]) - elif y_type == 'multilabel': - y_pred_right = y_true == y_pred - pos = (y_true == pos_label) - tp = np.logical_and(y_pred_right, pos).sum(0) - pos_sum = pos.sum(0) - return np.array([tp[i] / pos_sum[i] if pos_sum[i] > 0 else 0 for i in labels]) - else: - raise ValueError('not support targets type {}'.format(y_type)) - raise ValueError('not support for average type {}'.format(average)) - - -def precision_score(y_true, y_pred, labels=None, pos_label=1, average='binary'): - y_type, y_true, y_pred = _check_data(y_true, y_pred) - if average == 'binary': - if y_type != 'binary': - raise ValueError("data type is {} but use average type {}".format(y_type, average)) - else: - pos = (y_true == pos_label) - tp = np.logical_and((y_true == y_pred), pos).sum() - pos_pred = (y_pred == pos_label).sum() - return tp / pos_pred if pos_pred > 0 else 0 - elif average == None: - y_labels = set(list(np.unique(y_true))) - if labels is None: - labels = list(y_labels) - else: - for i in labels: - if (i not in y_labels and y_type != 'multilabel') or (y_type == 'multilabel' and i >= y_true.shape[1]): - warnings.warn('label {} is not contained in data'.format(i), UserWarning) - - if y_type in ['binary', 'multiclass']: - y_pred_right = y_true == y_pred - pos_list = [y_true == i for i in labels] - pos_sum_list = [(y_pred == i).sum() for i in labels] - return np.array([np.logical_and(y_pred_right, pos_i).sum() / sum_i if sum_i > 0 else 0 \ - for pos_i, sum_i in zip(pos_list, pos_sum_list)]) - elif y_type == 'multilabel': - y_pred_right = y_true == y_pred - pos = (y_true == pos_label) - tp = np.logical_and(y_pred_right, pos).sum(0) - pos_sum = (y_pred == pos_label).sum(0) - return np.array([tp[i] / pos_sum[i] if pos_sum[i] > 0 else 0 for i in labels]) - else: - raise ValueError('not support targets type {}'.format(y_type)) - raise ValueError('not support for average type {}'.format(average)) - - -def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary'): - precision = precision_score(y_true, y_pred, labels=labels, pos_label=pos_label, average=average) - recall = recall_score(y_true, y_pred, labels=labels, pos_label=pos_label, average=average) - if isinstance(precision, np.ndarray): - res = 2 * precision * recall / (precision + recall + 1e-10) - res[(precision + recall) <= 0] = 0 - return res - return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 - - def accuracy_topk(y_true, y_prob, k=1): """Compute accuracy of y_true matching top-k probable labels in y_prob. 
diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index fc2edb23..0d30c6e8 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -78,6 +78,18 @@ def convert(self, data): raise NotImplementedError +@DataSet.set_reader("read_naive") +class NativeDataSetLoader(DataSetLoader): + def __init__(self): + super(NativeDataSetLoader, self).__init__() + + def load(self, path): + ds = DataSet.read_csv(path, headers=("raw_sentence", "label"), sep="\t") + ds.set_input("raw_sentence") + ds.set_target("label") + return ds + + @DataSet.set_reader('read_raw') class RawDataSetLoader(DataSetLoader): def __init__(self): diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 52860b36..a6d542fa 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -1,253 +1,13 @@ -import math import unittest import torch -import torch as tc import torch.nn.functional as F import fastNLP.core.losses as loss +from fastNLP.core.losses import squash, unpad class TestLoss(unittest.TestCase): - - def test_case_1(self): - loss_func = loss.LossFunc(F.nll_loss) - nll_loss = loss.NLLLoss() - y = tc.Tensor( - [ - [.3, .4, .3], - [.5, .3, .2], - [.3, .6, .1], - ] - ) - - gy = tc.LongTensor( - [ - 0, - 1, - 2, - ] - ) - - y = tc.log(y) - los = loss_func({'input': y}, {'target': gy}) - losses = nll_loss({'input': y}, {'target': gy}) - - r = -math.log(.3) - math.log(.3) - math.log(.1) - r /= 3 - print("loss = %f" % (los)) - print("r = %f" % (r)) - print("nll_loss = %f" % (losses)) - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_2(self): - # 验证squash()的正确性 - - log = math.log - loss_func = loss.LossFromTorch("nll") - - y = tc.Tensor( - [ - [[.3, .4, .3], [.3, .4, .3], ], - [[.5, .3, .2], [.1, .2, .7], ], - [[.3, .6, .1], [.2, .1, .7], ], - ] - ) - - gy = tc.LongTensor( - [ - [0, 2], - [1, 2], - [2, 1], - ] - ) - - y = tc.log(y) - # los = loss_func({'input': y}, {'target': gy}) - los = loss_func(y, gy) - - r = -log(.3) - log(.3) - log(.1) - log(.3) - log(.7) - log(.1) - r /= 6 - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_3(self): - # 验证pack_padded_sequence()的正确性 - log = math.log - loss_func = loss.NLLLoss() - y = tc.Tensor( - [ - [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], ], - [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], ], - [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], ], - ] - ) - - gy = tc.LongTensor( - [ - [0, 2, 1, ], - [1, 2, 0, ], - [2, 0, 0, ], - ] - ) - - lens = [3, 2, 1] - - # pdb.set_trace() - - y = tc.log(y) - - yy = tc.nn.utils.rnn.pack_padded_sequence(y, lens, batch_first=True).data - gyy = tc.nn.utils.rnn.pack_padded_sequence(gy, lens, batch_first=True).data - los = loss_func({'input': yy}, {'target': gyy}) - - r = -log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) - r /= 6 - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_4(self): - # 验证unpad()的正确性 - log = math.log - y = tc.Tensor( - [ - [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], [.6, .3, .1, ], ], - [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], - [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], [.0, .0, .0, ], ], - ] - ) - - gy = tc.LongTensor( - [ - [0, 2, 1, 2, ], - [1, 2, 0, 0, ], - [2, 0, 0, 0, ], - ] - ) - - lens = [4, 2, 1] - y = tc.log(y) - - loss_func = loss.LossFromTorch("nll", pre_pro=["unpad"]) - los = loss_func(y, gy, lens=lens) - - r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) - r /= 7 - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_5(self): - # 
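The read_naive reader registered above is what the fastnlp_in_six_lines.ipynb notebook added later in this patch relies on; importing fastNLP.io.dataset_loader is what triggers the @DataSet.set_reader registration. A sketch using the sample csv shipped with the tests (path as used in the notebook):

    from fastNLP.core.dataset import DataSet
    import fastNLP.io.dataset_loader  # noqa: F401 -- registers "read_naive" on DataSet

    ds = DataSet.read_naive("../test/data_for_tests/tutorial_sample_dataset.csv")
    print(len(ds), ds[0])             # raw_sentence is set as input, label as target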
验证mask()和make_mask()的正确性 - log = math.log - - y = tc.Tensor( - [ - [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], - [[.5, .4, .1], [.3, .2, .5], [.4, .5, .1, ], [.6, .1, .3, ], ], - [[.3, .6, .1], [.3, .2, .5], [.0, .0, .0, ], [.0, .0, .0, ], ], - ] - ) - - gy = tc.LongTensor( - [ - [1, 2, 0, 0, ], - [0, 2, 1, 2, ], - [2, 1, 0, 0, ], - ] - ) - - mask = tc.ByteTensor( - [ - [1, 1, 0, 0, ], - [1, 1, 1, 1, ], - [1, 1, 0, 0, ], - ] - ) - - y = tc.log(y) - - lens = [2, 4, 2] - - loss_func = loss.LossFromTorch("nll", pre_pro=["mask"]) - los = loss_func(y, gy, mask=mask) - - los2 = loss_func(y, gy, mask=loss.make_mask(lens, gy.size()[-1])) - - r = -log(.3) - log(.7) - log(.5) - log(.5) - log(.5) - log(.3) - log(.1) - log(.2) - r /= 8 - - self.assertEqual(int(los * 1000), int(r * 1000)) - self.assertEqual(int(los2 * 1000), int(r * 1000)) - - def test_case_6(self): - # 验证unpad_mask()的正确性 - log = math.log - y = tc.Tensor( - [ - [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], [.6, .3, .1, ], ], - [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], - [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], [.0, .0, .0, ], ], - ] - ) - - gy = tc.LongTensor( - [ - [0, 2, 1, 2, ], - [1, 2, 0, 0, ], - [2, 0, 0, 0, ], - ] - ) - - lens = [4, 2, 1] - - # pdb.set_trace() - - y = tc.log(y) - - loss_func = loss.LossFromTorch("nll", pre_pro=["unpad_mask"]) - los = loss_func(y, gy, lens=lens) - - r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1) - r /= 7 - - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_7(self): - # 验证一些其他东西 - log = math.log - y = tc.Tensor( - [ - [[.3, .4, .3], [.3, .2, .5], [.4, .5, .1, ], [.6, .3, .1, ], ], - [[.5, .3, .2], [.1, .2, .7], [.0, .0, .0, ], [.0, .0, .0, ], ], - [[.3, .6, .1], [.0, .0, .0], [.0, .0, .0, ], [.0, .0, .0, ], ], - ] - ) - - gy = tc.LongTensor( - [ - [0, 2, 1, 2, ], - [1, 2, 0, 0, ], - [2, 0, 0, 0, ], - ] - ) - - lens = [4, 2, 1] - y = tc.log(y) - - loss_func = loss.LossFromTorch("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0])) - loss_func.add_pre_pro("unpad_mask") - los = loss_func(y, gy, lens=lens) - - r = - log(.3) - log(.5) - log(.3) - r /= 3 - self.assertEqual(int(los * 1000), int(r * 1000)) - - def test_case_8(self): - pass - - -class TestLoss_v2(unittest.TestCase): def test_CrossEntropyLoss(self): ce = loss.CrossEntropyLoss(pred="my_predict", target="my_truth") a = torch.randn(3, 5, requires_grad=False) @@ -276,6 +36,7 @@ def test_NLLLoss(self): ans = l1({"my_predict": a}, {"my_truth": b}) self.assertEqual(ans, torch.nn.functional.nll_loss(a, b)) + class TestLosserError(unittest.TestCase): def test_losser1(self): # (1) only input, targets passed @@ -292,11 +53,12 @@ def test_losser2(self): target_dict = {'target': torch.zeros(16, 3).long()} los = loss.CrossEntropyLoss() - # print(los(pred_dict=pred_dict, target_dict=target_dict)) + with self.assertRaises(RuntimeError): + print(los(pred_dict=pred_dict, target_dict=target_dict)) def test_losser3(self): # (2) with corrupted size - pred_dict = {"pred": torch.zeros(16, 3), 'stop_fast_param':0} + pred_dict = {"pred": torch.zeros(16, 3), 'stop_fast_param': 0} target_dict = {'target': torch.zeros(16).long()} los = loss.CrossEntropyLoss() @@ -311,3 +73,15 @@ def test_check_error(self): with self.assertRaises(Exception): ans = l1({"my_predict": a}, {"truth": b, "my": a}) + + +class TestLossUtils(unittest.TestCase): + def test_squash(self): + a, b = squash(torch.randn(3, 5), torch.randn(3, 5)) + self.assertEqual(tuple(a.size()), (3, 5)) + 
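For reference, the squash and unpad helpers themselves survive in losses.py, and the new TestLossUtils cases in this hunk pin down their basic shape behaviour. A sketch reproducing it; the interpretation that unpad passes tensors through unchanged when no lens keyword is given is inferred from these tests:

    import torch
    from fastNLP.core.losses import squash, unpad

    # squash keeps the prediction's class dimension and flattens the gold labels
    pred, gold = squash(torch.randn(3, 5), torch.randn(3, 5))
    print(pred.size(), gold.size())      # torch.Size([3, 5]) torch.Size([15])

    # without a lens= keyword, unpad leaves both tensors as they are
    pred, gold = unpad(torch.randn(5, 8, 3), torch.randn(5, 8))
    print(pred.size(), gold.size())      # torch.Size([5, 8, 3]) torch.Size([5, 8])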
self.assertEqual(tuple(b.size()), (15,)) + + def test_unpad(self): + a, b = unpad(torch.randn(5, 8, 3), torch.randn(5, 8)) + self.assertEqual(tuple(a.size()), (5, 8, 3)) + self.assertEqual(tuple(b.size()), (5, 8)) diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index d2e45379..c6267664 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -4,7 +4,7 @@ import torch from fastNLP.core.metrics import AccuracyMetric -from fastNLP.core.metrics import accuracy_score, recall_score, precision_score, f1_score, pred_topk, accuracy_topk +from fastNLP.core.metrics import pred_topk, accuracy_topk class TestAccuracyMetric(unittest.TestCase): @@ -139,10 +139,6 @@ class TestUsefulFunctions(unittest.TestCase): # 测试metrics.py中一些看上去挺有用的函数 def test_case_1(self): # multi-class - _ = accuracy_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1))) - _ = precision_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) - _ = recall_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) - _ = f1_score(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), average=None) _ = accuracy_topk(np.random.randint(0, 3, size=(10, 1)), np.random.randint(0, 3, size=(10, 1)), k=3) _ = pred_topk(np.random.randint(0, 3, size=(10, 1))) diff --git a/tutorials/fastnlp_in_six_lines.ipynb b/tutorials/fastnlp_in_six_lines.ipynb new file mode 100644 index 00000000..2d8f40d7 --- /dev/null +++ b/tutorials/fastnlp_in_six_lines.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# 六行代码搞定FastNLP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP.core.dataset import DataSet\n", + "import fastNLP.io.dataset_loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = DataSet.read_naive(\"../test/data_for_tests/tutorial_sample_dataset.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 720a264eb3035a2acf99a9a3d5ef096f16de75be Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Fri, 7 Dec 2018 14:53:27 +0800 Subject: [PATCH 65/67] * rename DataSet.get_fields() into get_all_fields() * add DataSet.get_field(), to fetch a FieldArray based on its name * remove old tutorials & add new tutorials --- fastNLP/api/processor.py | 4 +- fastNLP/core/batch.py | 2 +- fastNLP/core/dataset.py | 7 +- fastNLP/core/sampler.py | 2 +- fastNLP/models/cnn_text_classification.py | 6 +- test/core/test_dataset.py | 16 + .../tutorial_sample_dataset.csv | 41 +- 
tutorials/fastnlp_10min_tutorial_v2.ipynb | 911 ++++++++++++++++++ tutorials/fastnlp_10tmin_tutorial.ipynb | 860 +++++++++++++++++ tutorials/fastnlp_1_minute_tutorial.ipynb | 333 +++++++ ....ipynb => fastnlp_advanced_tutorial.ipynb} | 64 +- tutorials/fastnlp_tutorial_1203.ipynb | 526 ---------- tutorials/fastnlp_tutorial_1204.ipynb | 447 --------- 13 files changed, 2215 insertions(+), 1004 deletions(-) create mode 100644 tutorials/fastnlp_10min_tutorial_v2.ipynb create mode 100644 tutorials/fastnlp_10tmin_tutorial.ipynb create mode 100644 tutorials/fastnlp_1_minute_tutorial.ipynb rename tutorials/{fastnlp_in_six_lines.ipynb => fastnlp_advanced_tutorial.ipynb} (53%) delete mode 100644 tutorials/fastnlp_tutorial_1203.ipynb delete mode 100644 tutorials/fastnlp_tutorial_1204.ipynb diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py index d6a68412..fcda3e7c 100644 --- a/fastNLP/api/processor.py +++ b/fastNLP/api/processor.py @@ -268,7 +268,7 @@ def __init__(self, field_dict, default=False): self.default = default def process(self, dataset): - set_dict = {name: self.default for name in dataset.get_fields().keys()} + set_dict = {name: self.default for name in dataset.get_all_fields().keys()} set_dict.update(self.field_dict) dataset._set_need_tensor(**set_dict) return dataset @@ -282,7 +282,7 @@ def __init__(self, field_dict, default=False): self.default = default def process(self, dataset): - set_dict = {name: self.default for name in dataset.get_fields().keys()} + set_dict = {name: self.default for name in dataset.get_all_fields().keys()} set_dict.update(self.field_dict) dataset.set_target(**set_dict) return dataset diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 1e7d56fd..1bb26129 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -43,7 +43,7 @@ def __next__(self): indices = self.idx_list[self.curidx:endidx] - for field_name, field in self.dataset.get_fields().items(): + for field_name, field in self.dataset.get_all_fields().items(): if field.is_target or field.is_input: batch = field.get(indices) if not self.as_numpy: diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index f4963d0a..d4d285d7 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -157,7 +157,12 @@ def delete_field(self, name): """ self.field_arrays.pop(name) - def get_fields(self): + def get_field(self, field_name): + if field_name not in self.field_arrays: + raise KeyError("Field name {} not found in DataSet".format(field_name)) + return self.field_arrays[field_name] + + def get_all_fields(self): """Return all the fields with their names. :return dict field_arrays: the internal data structure of DataSet. 
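After the rename, downstream code asks either for the whole field dict or for a single FieldArray by name; a quick sketch matching the new tests:

    from fastNLP.core.dataset import DataSet

    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})

    fields = ds.get_all_fields()     # dict: field name -> FieldArray (formerly get_fields())
    x_field = ds.get_field("x")      # one FieldArray; unknown names raise KeyError
    print(list(fields.keys()), x_field.content[0])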
diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index d568acf3..766d71a7 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -55,7 +55,7 @@ def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_lens' def __call__(self, data_set): - seq_lens = data_set.get_fields()[self.seq_lens_field_name].content + seq_lens = data_set.get_all_fields()[self.seq_lens_field_name].content total_sample_num = len(seq_lens) bucket_indexes = [] diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index c8fe5181..f3898c00 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -44,7 +44,7 @@ def forward(self, word_seq): x = self.conv_pool(x) # [N,L,C] -> [N,C] x = self.dropout(x) x = self.fc(x) # [N,C] -> [N, N_class] - return {'output': x} + return {'pred': x} def predict(self, word_seq): """ @@ -53,5 +53,5 @@ def predict(self, word_seq): :return predict: dict of torch.LongTensor, [batch_size, seq_len] """ output = self(word_seq) - _, predict = output['output'].max(dim=1) - return {'predict': predict} + _, predict = output['pred'].max(dim=1) + return {'pred': predict} diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 9527e8ee..74ad5958 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -2,6 +2,7 @@ import unittest from fastNLP.core.dataset import DataSet +from fastNLP.core.fieldarray import FieldArray from fastNLP.core.instance import Instance @@ -162,6 +163,21 @@ def test_save_load(self): ds_1 = DataSet.load("./my_ds.pkl") os.remove("my_ds.pkl") + def test_get_all_fields(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ans = ds.get_all_fields() + self.assertEqual(ans["x"].content, [[1, 2, 3, 4]] * 10) + self.assertEqual(ans["y"].content, [[5, 6]] * 10) + + def test_get_field(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ans = ds.get_field("x") + self.assertTrue(isinstance(ans, FieldArray)) + self.assertEqual(ans.content, [[1, 2, 3, 4]] * 10) + ans = ds.get_field("y") + self.assertTrue(isinstance(ans, FieldArray)) + self.assertEqual(ans.content, [[5, 6]] * 10) + class TestDataSetIter(unittest.TestCase): def test__repr__(self): diff --git a/test/data_for_tests/tutorial_sample_dataset.csv b/test/data_for_tests/tutorial_sample_dataset.csv index c3137854..e5c0a74f 100644 --- a/test/data_for_tests/tutorial_sample_dataset.csv +++ b/test/data_for_tests/tutorial_sample_dataset.csv @@ -35,4 +35,43 @@ There 's very little sense to what 's going on here , but the makers serve up th Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different . 2 They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid . 1 It almost feels as if the movie is more interested in entertaining itself than in amusing us . 1 -The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0 \ No newline at end of file +The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0 +I still like Moonlight Mile , better judgment be damned . 3 +A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 +a bilingual charmer , just like the woman who inspired it 3 +Like a less dizzily gorgeous companion to Mr. 
Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 +As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 +It 's everything you 'd expect -- but nothing more . 2 +Best indie of the year , so far . 4 +Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 +It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 +That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 +The plot is romantic comedy boilerplate from start to finish . 2 +It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 +A film that clearly means to preach exclusively to the converted . 2 +I still like Moonlight Mile , better judgment be damned . 3 +A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 +a bilingual charmer , just like the woman who inspired it 3 +Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 +As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 +It 's everything you 'd expect -- but nothing more . 2 +Best indie of the year , so far . 4 +Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 +It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 +That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 +The plot is romantic comedy boilerplate from start to finish . 2 +It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 +A film that clearly means to preach exclusively to the converted . 2 +I still like Moonlight Mile , better judgment be damned . 3 +A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 +a bilingual charmer , just like the woman who inspired it 3 +Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 +As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 +It 's everything you 'd expect -- but nothing more . 2 +Best indie of the year , so far . 4 +Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 +It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 +That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 +The plot is romantic comedy boilerplate from start to finish . 2 +It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 +A film that clearly means to preach exclusively to the converted . 
2 \ No newline at end of file diff --git a/tutorials/fastnlp_10min_tutorial_v2.ipynb b/tutorials/fastnlp_10min_tutorial_v2.ipynb new file mode 100644 index 00000000..f86e5bf3 --- /dev/null +++ b/tutorials/fastnlp_10min_tutorial_v2.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "fastNLP上手教程\n", + "-------\n", + "\n", + "fastNLP提供方便的数据预处理,训练和测试模型的功能" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataSet & Instance\n", + "------\n", + "\n", + "fastNLP用DataSet和Instance保存和处理数据。每个DataSet表示一个数据集,每个Instance表示一个数据样本。一个DataSet存有多个Instance,每个Instance可以自定义存哪些内容。\n", + "\n", + "有一些read_*方法,可以轻松从文件读取数据,存成DataSet。" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8529" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import DataSet\n", + "from fastNLP import Instance\n", + "\n", + "# 从csv读取数据到DataSet\n", + "dataset = DataSet.read_csv('../sentence.csv', headers=('raw_sentence', 'label'), sep='\\t')\n", + "print(len(dataset))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 使用数字索引[k],获取第k个样本\n", + "print(dataset[0])\n", + "\n", + "# 索引也可以是负数\n", + "print(dataset[-3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instance\n", + "Instance表示一个样本,由一个或多个field(域,属性,特征)组成,每个field有名字和值。\n", + "\n", + "在初始化Instance时即可定义它包含的域,使用 \"field_name=field_value\"的写法。" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'raw_sentence': fake data,\n'label': 0}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataSet.append(Instance)加入新数据\n", + "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", + "dataset[-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataSet.apply方法\n", + "数据预处理利器" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 将所有数字转为小写\n", + "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# label转int\n", + "dataset.apply(lambda x: int(x['label']), new_field_name='label')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1,\n'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 使用空格分割句子\n", + "def split_sent(ins):\n", + " return ins['raw_sentence'].split()\n", + "dataset.apply(split_sent, new_field_name='words')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1,\n'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'],\n'seq_len': 37}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 增加长度信息\n", + "dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataSet.drop\n", + "筛选数据" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8358" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "dataset.drop(lambda x: x['seq_len'] <= 3)\n", + "print(len(dataset))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 配置DataSet\n", + "1. 哪些域是特征,哪些域是标签\n", + "2. 
切分训练集/验证集" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# 设置DataSet中,哪些field要转为tensor\n", + "\n", + "# set target,loss或evaluate中的golden,计算loss,模型评估时使用\n", + "dataset.set_target(\"label\")\n", + "# set input,模型forward时使用\n", + "dataset.set_input(\"words\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5851" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2507" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 分出测试集、训练集\n", + "\n", + "test_data, train_data = dataset.split(0.3)\n", + "print(len(test_data))\n", + "print(len(train_data))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary\n", + "------\n", + "\n", + "fastNLP中的Vocabulary轻松构建词表,将词转成数字" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': the project 's filmmakers forgot to include anything even halfway scary as they poorly rejigger fatal attraction into a high school setting .,\n'label': 0,\n'words': [4, 423, 9, 316, 1, 8, 1, 312, 72, 1478, 885, 14, 86, 725, 1, 1913, 1431, 53, 5, 455, 736, 1, 2],\n'seq_len': 23}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import Vocabulary\n", + "\n", + "# 构建词表, Vocabulary.add(word)\n", + "vocab = Vocabulary(min_freq=2)\n", + "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", + "vocab.build_vocab()\n", + "\n", + "# index句子, Vocabulary.to_index(word)\n", + "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')\n", + "test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')\n", + "\n", + "\n", + "print(test_data[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model\n", + "定义一个PyTorch模型" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CNNText(\n (embed): Embedding(\n (embed): Embedding(3459, 50, padding_idx=0)\n (dropout): Dropout(p=0.0)\n )\n (conv_pool): ConvMaxpool(\n (convs): ModuleList(\n (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n )\n )\n (dropout): Dropout(p=0.1)\n (fc): Linear(\n (linear): Linear(in_features=12, out_features=5, bias=True)\n )\n)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP.models import CNNText\n", + "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这是上述模型的forward方法。如果你不知道什么是forward方法,请参考我们的PyTorch教程。\n", + "\n", + "注意两点:\n", + "1. forward参数名字叫**word_seq**,请记住。\n", + "2. 
forward的返回值是一个**dict**,其中有个key的名字叫**output**。\n", + "\n", + "```Python\n", + " def forward(self, word_seq):\n", + " \"\"\"\n", + "\n", + " :param word_seq: torch.LongTensor, [batch_size, seq_len]\n", + " :return output: dict of torch.LongTensor, [batch_size, num_classes]\n", + " \"\"\"\n", + " x = self.embed(word_seq) # [N,L] -> [N,L,C]\n", + " x = self.conv_pool(x) # [N,L,C] -> [N,C]\n", + " x = self.dropout(x)\n", + " x = self.fc(x) # [N,C] -> [N, N_class]\n", + " return {'output': x}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这是上述模型的predict方法,是用来直接输出该任务的预测结果,与forward目的不同。\n", + "\n", + "注意两点:\n", + "1. predict参数名也叫**word_seq**。\n", + "2. predict的返回值是也一个**dict**,其中有个key的名字叫**predict**。\n", + "\n", + "```\n", + " def predict(self, word_seq):\n", + " \"\"\"\n", + "\n", + " :param word_seq: torch.LongTensor, [batch_size, seq_len]\n", + " :return predict: dict of torch.LongTensor, [batch_size, seq_len]\n", + " \"\"\"\n", + " output = self(word_seq)\n", + " _, predict = output['output'].max(dim=1)\n", + " return {'predict': predict}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trainer & Tester\n", + "------\n", + "\n", + "使用fastNLP的Trainer训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import Trainer\n", + "from copy import deepcopy\n", + "from fastNLP.core.losses import CrossEntropyLoss\n", + "from fastNLP.core.metrics import AccuracyMetric\n", + "\n", + "\n", + "# 更改DataSet中对应field的名称,与模型的forward的参数名一致\n", + "# 因为forward的参数叫word_seq, 所以要把原本叫words的field改名为word_seq\n", + "# 这里的演示是让你了解这种**命名规则**\n", + "train_data.rename_field('words', 'word_seq')\n", + "test_data.rename_field('words', 'word_seq')\n", + "\n", + "# 顺便把label换名为label_seq\n", + "train_data.rename_field('label', 'label_seq')\n", + "test_data.rename_field('label', 'label_seq')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### loss\n", + "训练模型需要提供一个损失函数\n", + "\n", + "下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n", + "\n", + "pred参数对应的是模型的forward返回的dict的一个key的名字,这里是\"output\"。\n", + "\n", + "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "loss = CrossEntropyLoss(pred=\"output\", target=\"label_seq\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Metric\n", + "定义评价指标\n", + "\n", + "这里使用准确率。参数的“命名规则”跟上面类似。\n", + "\n", + "pred参数对应的是模型的predict方法返回的dict的一个key的名字,这里是\"predict\"。\n", + "\n", + "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "metric = AccuracyMetric(pred=\"predict\", target=\"label_seq\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-07 14:11:31" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=915), HTML(value='')), layout=Layout(display=…" + ] + }, + "execution_count": 0, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5. 
Step:183/915. AccuracyMetric: acc=0.350367" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2/5. Step:366/915. AccuracyMetric: acc=0.409332" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/5. Step:549/915. AccuracyMetric: acc=0.572552" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 4/5. Step:732/915. AccuracyMetric: acc=0.711331" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 5/5. Step:915/915. AccuracyMetric: acc=0.801572" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + } + ], + "source": [ + "# 实例化Trainer,传入模型和数据,进行训练\n", + "# 先在test_data拟合\n", + "copy_model = deepcopy(model)\n", + "overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,\n", + " loss=loss,\n", + " metrics=metric,\n", + " save_path=None,\n", + " batch_size=32,\n", + " n_epochs=5)\n", + "overfit_trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-07 14:12:21" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=395), HTML(value='')), layout=Layout(display=…" + ] + }, + "execution_count": 0, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5. Step:79/395. AccuracyMetric: acc=0.250043" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2/5. Step:158/395. AccuracyMetric: acc=0.280807" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/5. Step:237/395. AccuracyMetric: acc=0.280978" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 4/5. Step:316/395. AccuracyMetric: acc=0.285592" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 5/5. 
Step:395/395. AccuracyMetric: acc=0.278927" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + } + ], + "source": [ + "# 用train_data训练,在test_data验证\n", + "trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,\n", + " loss=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", + " metrics=AccuracyMetric(pred=\"predict\", target=\"label_seq\"),\n", + " save_path=None,\n", + " batch_size=32,\n", + " n_epochs=5)\n", + "trainer.train()\n", + "print('Train finished!')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[tester] \nAccuracyMetric: acc=0.280636" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AccuracyMetric': {'acc': 0.280636}}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 调用Tester在test_data上评价效果\n", + "from fastNLP import Tester\n", + "\n", + "tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred=\"predict\", target=\"label_seq\"),\n", + " batch_size=4)\n", + "acc = tester.test()\n", + "print(acc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/fastnlp_10tmin_tutorial.ipynb b/tutorials/fastnlp_10tmin_tutorial.ipynb new file mode 100644 index 00000000..bad29f55 --- /dev/null +++ b/tutorials/fastnlp_10tmin_tutorial.ipynb @@ -0,0 +1,860 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "fastNLP上手教程\n", + "-------\n", + "\n", + "fastNLP提供方便的数据预处理,训练和测试模型的功能" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataSet & Instance\n", + "------\n", + "\n", + "fastNLP用DataSet和Instance保存和处理数据。每个DataSet表示一个数据集,每个Instance表示一个数据样本。一个DataSet存有多个Instance,每个Instance可以自定义存哪些内容。\n", + "\n", + "有一些read_*方法,可以轻松从文件读取数据,存成DataSet。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n'label': 1}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import DataSet\n", + "from fastNLP import Instance\n", + "\n", + "# 从csv读取数据到DataSet\n", + "win_path = \"C:\\\\Users\\zyfeng\\Desktop\\FudanNLP\\\\fastNLP\\\\test\\\\data_for_tests\\\\tutorial_sample_dataset.csv\"\n", + "dataset = DataSet.read_csv(win_path, headers=('raw_sentence', 'label'), sep='\\t')\n", + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'raw_sentence': fake 
data,\n'label': 0}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataSet.append(Instance)加入新数据\n", + "\n", + "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", + "dataset[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# DataSet.apply(func, new_field_name)对数据预处理\n", + "\n", + "# 将所有数字转为小写\n", + "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", + "# label转int\n", + "dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)\n", + "# 使用空格分割句子\n", + "dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0)\n", + "def split_sent(ins):\n", + " return ins['raw_sentence'].split()\n", + "dataset.apply(split_sent, new_field_name='words', is_input=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# DataSet.drop(func)筛除数据\n", + "# 删除低于某个长度的词语\n", + "dataset.drop(lambda x: len(x['words']) <= 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train size: " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "54" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test size: " + ] + } + ], + "source": [ + "# 分出测试集、训练集\n", + "\n", + "test_data, train_data = dataset.split(0.3)\n", + "print(\"Train size: \", len(test_data))\n", + "print(\"Test size: \", len(train_data))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary\n", + "------\n", + "\n", + "fastNLP中的Vocabulary轻松构建词表,将词转成数字" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'raw_sentence': the plot is romantic comedy boilerplate from start to finish .,\n'label': 2,\n'label_seq': 2,\n'words': ['the', 'plot', 'is', 'romantic', 'comedy', 'boilerplate', 'from', 'start', 'to', 'finish', '.'],\n'word_seq': [2, 13, 9, 24, 25, 26, 15, 27, 11, 28, 3]}" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import Vocabulary\n", + "\n", + "# 构建词表, Vocabulary.add(word)\n", + "vocab = Vocabulary(min_freq=2)\n", + "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", + "vocab.build_vocab()\n", + "\n", + "# index句子, Vocabulary.to_index(word)\n", + "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", + "test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", + "\n", + "\n", + "print(test_data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch_x has: {'words': array([list(['this', 'kind', 'of', 'hands-on', 'storytelling', 'is', 'ultimately', 'what', 'makes', 'shanghai', 'ghetto', 'move', 'beyond', 'a', 'good', ',', 'dry', ',', 'reliable', 'textbook', 'and', 'what', 'allows', 'it', 'to', 'rank', 'with', 'its', 'worthy', 'predecessors', '.']),\n", + " list(['the', 'entire', 
'movie', 'is', 'filled', 'with', 'deja', 'vu', 'moments', '.'])],\n", + " dtype=object), 'word_seq': tensor([[ 19, 184, 6, 1, 481, 9, 206, 50, 91, 1210, 1609, 1330,\n", + " 495, 5, 63, 4, 1269, 4, 1, 1184, 7, 50, 1050, 10,\n", + " 8, 1611, 16, 21, 1039, 1, 2],\n", + " [ 3, 711, 22, 9, 1282, 16, 2482, 2483, 200, 2, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0]])}\n", + "batch_y has: {'label_seq': tensor([3, 2])}\n" + ] + } + ], + "source": [ + "# 假设你们需要做强化学习或者gan之类的项目,也许你们可以使用这里的dataset\n", + "from fastNLP.core.batch import Batch\n", + "from fastNLP.core.sampler import RandomSampler\n", + "\n", + "batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())\n", + "for batch_x, batch_y in batch_iterator:\n", + " print(\"batch_x has: \", batch_x)\n", + " print(\"batch_y has: \", batch_y)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "CNNText(\n (embed): Embedding(\n (embed): Embedding(77, 50, padding_idx=0)\n (dropout): Dropout(p=0.0)\n )\n (conv_pool): ConvMaxpool(\n (convs): ModuleList(\n (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n )\n )\n (dropout): Dropout(p=0.1)\n (fc): Linear(\n (linear): Linear(in_features=12, out_features=5, bias=True)\n )\n)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 定义一个简单的Pytorch模型\n", + "\n", + "from fastNLP.models import CNNText\n", + "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trainer & Tester\n", + "------\n", + "\n", + "使用fastNLP的Trainer训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import Trainer\n", + "from copy import deepcopy\n", + "from fastNLP import CrossEntropyLoss\n", + "from fastNLP import AccuracyMetric" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-07 14:07:20" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" + ] + }, + "execution_count": 0, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.037037" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.296296" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/10. Step:6/20. 
AccuracyMetric: acc=0.333333" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.555556" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.611111" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.481481" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.62963" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.685185" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.722222" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.777778" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + } + ], + "source": [ + "# 进行overfitting测试\n", + "copy_model = deepcopy(model)\n", + "overfit_trainer = Trainer(model=copy_model, \n", + " train_data=test_data, \n", + " dev_data=test_data,\n", + " loss=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", + " metrics=AccuracyMetric(),\n", + " n_epochs=10,\n", + " save_path=None)\n", + "overfit_trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-07 14:08:10" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=5), HTML(value='')), layout=Layout(display='i…" + ] + }, + "execution_count": 0, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5. Step:1/5. AccuracyMetric: acc=0.037037" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2/5. Step:2/5. 
AccuracyMetric: acc=0.037037" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/5. Step:3/5. AccuracyMetric: acc=0.037037" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 4/5. Step:4/5. AccuracyMetric: acc=0.185185" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 5/5. Step:5/5. AccuracyMetric: acc=0.240741" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train finished!" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 实例化Trainer,传入模型和数据,进行训练\n", + "trainer = Trainer(model=model, \n", + " train_data=train_data, \n", + " dev_data=test_data,\n", + " loss=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", + " metrics=AccuracyMetric(),\n", + " n_epochs=5)\n", + "trainer.train()\n", + "print('Train finished!')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[tester] \nAccuracyMetric: acc=0.240741" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import Tester\n", + "\n", + "tester = Tester(data=test_data, model=model, metrics=AccuracyMetric())\n", + "acc = tester.test()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In summary\n", + "\n", + "## fastNLP Trainer的伪代码逻辑\n", + "### 1. 准备DataSet,假设DataSet中共有如下的fields\n", + " ['raw_sentence', 'word_seq1', 'word_seq2', 'raw_label','label']\n", + " 通过\n", + " DataSet.set_input('word_seq1', word_seq2', flag=True)将'word_seq1', 'word_seq2'设置为input\n", + " 通过\n", + " DataSet.set_target('label', flag=True)将'label'设置为target\n", + "### 2. 初始化模型\n", + " class Model(nn.Module):\n", + " def __init__(self):\n", + " xxx\n", + " def forward(self, word_seq1, word_seq2):\n", + " # (1) 这里使用的形参名必须和DataSet中的input field的名称对应。因为我们是通过形参名, 进行赋值的\n", + " # (2) input field的数量可以多于这里的形参数量。但是不能少于。\n", + " xxxx\n", + " # 输出必须是一个dict\n", + "### 3. Trainer的训练过程\n", + " (1) 从DataSet中按照batch_size取出一个batch,调用Model.forward\n", + " (2) 将 Model.forward的结果 与 标记为target的field 传入Losser当中。\n", + " 由于每个人写的Model.forward的output的dict可能key并不一样,比如有人是{'pred':xxx}, {'output': xxx}; \n", + " 另外每个人将target可能也会设置为不同的名称, 比如有人是label, 有人设置为target;\n", + " 为了解决以上的问题,我们的loss提供映射机制\n", + " 比如CrossEntropyLosser的需要的输入是(prediction, target)。但是forward的output是{'output': xxx}; 'label'是target\n", + " 那么初始化losser的时候写为CrossEntropyLosser(prediction='output', target='label')即可\n", + " (3) 对于Metric是同理的\n", + " Metric计算也是从 forward的结果中取值 与 设置target的field中取值。 也是可以通过映射找到对应的值 \n", + " \n", + " \n", + "\n", + "## 一些问题.\n", + "### 1. 
DataSet中为什么需要设置input和target\n", + " 只有被设置为input或者target的数据才会在train的过程中被取出来\n", + " (1.1) 我们只会在设置为input的field中寻找传递给Model.forward的参数。\n", + " (1.2) 我们在传递值给losser或者metric的时候会使用来自: \n", + " (a)Model.forward的output\n", + " (b)被设置为target的field\n", + " \n", + "\n", + "### 2. 我们是通过forwad中的形参名将DataSet中的field赋值给对应的参数\n", + " (1.1) 构建模型过程中,\n", + " 例如:\n", + " DataSet中x,seq_lens是input,那么forward就应该是\n", + " def forward(self, x, seq_lens):\n", + " pass\n", + " 我们是通过形参名称进行匹配的field的\n", + " \n", + "\n", + "\n", + "### 1. 加载数据到DataSet\n", + "### 2. 使用apply操作对DataSet进行预处理\n", + " (2.1) 处理过程中将某些field设置为input,某些field设置为target\n", + "### 3. 构建模型\n", + " (3.1) 构建模型过程中,需要注意forward函数的形参名需要和DataSet中设置为input的field名称是一致的。\n", + " 例如:\n", + " DataSet中x,seq_lens是input,那么forward就应该是\n", + " def forward(self, x, seq_lens):\n", + " pass\n", + " 我们是通过形参名称进行匹配的field的\n", + " (3.2) 模型的forward的output需要是dict类型的。\n", + " 建议将输出设置为{\"pred\": xx}.\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/fastnlp_1_minute_tutorial.ipynb b/tutorials/fastnlp_1_minute_tutorial.ipynb new file mode 100644 index 00000000..e584a405 --- /dev/null +++ b/tutorials/fastnlp_1_minute_tutorial.ipynb @@ -0,0 +1,333 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# FastNLP 1分钟上手教程" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step 1\n", + "读取数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import DataSet\n", + "# linux_path = \"../test/data_for_tests/tutorial_sample_dataset.csv\"\n", + "win_path = \"C:\\\\Users\\zyfeng\\Desktop\\FudanNLP\\\\fastNLP\\\\test\\\\data_for_tests\\\\tutorial_sample_dataset.csv\"\n", + "ds = DataSet.read_csv(win_path, headers=('raw_sentence', 'label'), sep='\\t')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step 2\n", + "数据预处理\n", + "1. 类型转换\n", + "2. 切分验证集\n", + "3. 
构建词典" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# 将所有数字转为小写\n", + "ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", + "# label转int\n", + "ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)\n", + "\n", + "def split_sent(ins):\n", + " return ins['raw_sentence'].split()\n", + "ds.apply(split_sent, new_field_name='words', is_input=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train size: " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "54" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test size: " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 分割训练集/验证集\n", + "train_data, dev_data = ds.split(0.3)\n", + "print(\"Train size: \", len(train_data))\n", + "print(\"Test size: \", len(dev_data))" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP import Vocabulary\n", + "vocab = Vocabulary(min_freq=2)\n", + "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", + "\n", + "# index句子, Vocabulary.to_index(word)\n", + "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", + "dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step 3\n", + " 定义模型" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP.models import CNNText\n", + "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## step 4\n", + "开始训练" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2018-12-07 14:03:41" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=6), HTML(value='')), layout=Layout(display='i…" + ] + }, + "execution_count": 0, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/3. Step:2/6. AccuracyMetric: acc=0.26087" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2/3. Step:4/6. 
AccuracyMetric: acc=0.347826" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/3. Step:6/6. AccuracyMetric: acc=0.608696" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train finished!" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric\n", + "trainer = Trainer(model=model, \n", + " train_data=train_data, \n", + " dev_data=dev_data,\n", + " loss=CrossEntropyLoss(),\n", + " metrics=AccuracyMetric()\n", + " )\n", + "trainer.train()\n", + "print('Train finished!')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 本教程结束。更多操作请参考进阶教程。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/fastnlp_in_six_lines.ipynb b/tutorials/fastnlp_advanced_tutorial.ipynb similarity index 53% rename from tutorials/fastnlp_in_six_lines.ipynb rename to tutorials/fastnlp_advanced_tutorial.ipynb index 2d8f40d7..c1322ab8 100644 --- a/tutorials/fastnlp_in_six_lines.ipynb +++ b/tutorials/fastnlp_advanced_tutorial.ipynb @@ -6,48 +6,68 @@ "collapsed": true }, "source": [ - "# 六行代码搞定FastNLP" + "## FastNLP 进阶教程\n", + "本教程阅读时间平均30分钟" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from fastNLP.core.dataset import DataSet\n", - "import fastNLP.io.dataset_loader" + "## 数据部分\n", + "### DataSet\n" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "ds = DataSet.read_naive(\"../test/data_for_tests/tutorial_sample_dataset.csv\")" + "### Instance" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### Vocabulary" + ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## 模型部分\n", + "### model" + ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## 训练测试部分\n", + "### Loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tester" + ] }, { "cell_type": "code", diff --git a/tutorials/fastnlp_tutorial_1203.ipynb b/tutorials/fastnlp_tutorial_1203.ipynb deleted file mode 100644 index cb8fa6a0..00000000 --- a/tutorials/fastnlp_tutorial_1203.ipynb +++ /dev/null @@ -1,526 
+0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "fastNLP上手教程\n", - "-------\n", - "\n", - "fastNLP提供方便的数据预处理,训练和测试模型的功能" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", - " \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n" - ] - } - ], - "source": [ - "import sys\n", - "sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP/')\n", - "\n", - "import fastNLP as fnlp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "DataSet & Instance\n", - "------\n", - "\n", - "fastNLP用DataSet和Instance保存和处理数据。每个DataSet表示一个数据集,每个Instance表示一个数据样本。一个DataSet存有多个Instance,每个Instance可以自定义存哪些内容。\n", - "\n", - "有一些read_*方法,可以轻松从文件读取数据,存成DataSet。" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .,\n", - "'label': 1}\n" - ] - } - ], - "source": [ - "from fastNLP import DataSet\n", - "from fastNLP import Instance\n", - "\n", - "# 从csv读取数据到DataSet\n", - "dataset = DataSet.read_csv('sentence.csv', headers=('raw_sentence', 'label'), sep='\\t')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': fake data,\n", - "'label': 0}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# DataSet.append(Instance)加入新数据\n", - "\n", - "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", - "dataset[-1]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# DataSet.apply(func, new_field_name)对数据预处理\n", - "\n", - "# 将所有数字转为小写\n", - "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", - "# label转int\n", - "dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)\n", - "# 使用空格分割句子\n", - "dataset.drop(lambda x:len(x['raw_sentence'].split())==0)\n", - "def split_sent(ins):\n", - " return ins['raw_sentence'].split()\n", - "dataset.apply(split_sent, new_field_name='words', is_input=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# DataSet.drop(func)筛除数据\n", - "# 删除低于某个长度的词语\n", - "# dataset.drop(lambda x: len(x['words']) <= 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train size: 5971\n", - "Test size: 2558\n" - ] - } - ], - "source": [ - "# 分出测试集、训练集\n", - "\n", - "test_data, train_data = dataset.split(0.3)\n", - "print(\"Train size: \", len(test_data))\n", - "print(\"Test size: \", len(train_data))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Vocabulary\n", - "------\n", - "\n", - "fastNLP中的Vocabulary轻松构建词表,将词转成数字" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': gussied up with so many distracting special effects and visual party tricks that it 's not clear whether we 're supposed to shriek or laugh .,\n", - "'label': 1,\n", - "'label_seq': 1,\n", - "'words': ['gussied', 'up', 'with', 'so', 'many', 'distracting', 'special', 'effects', 'and', 'visual', 'party', 'tricks', 'that', 'it', \"'s\", 'not', 'clear', 'whether', 'we', \"'re\", 'supposed', 'to', 'shriek', 'or', 'laugh', '.'],\n", - "'word_seq': [1, 65, 16, 43, 108, 1, 329, 433, 7, 319, 1313, 1, 12, 10, 11, 27, 1428, 567, 86, 134, 1949, 8, 1, 49, 506, 2]}\n" - ] - } - ], - "source": [ - "from fastNLP import Vocabulary\n", - "\n", - "# 构建词表, Vocabulary.add(word)\n", - "vocab = Vocabulary(min_freq=2)\n", - "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", - "vocab.build_vocab()\n", - "\n", - "# index句子, Vocabulary.to_index(word)\n", - "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", - "test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n", - "\n", - "\n", - "print(test_data[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "batch_x has: {'words': array([list(['this', 'kind', 'of', 'hands-on', 'storytelling', 'is', 'ultimately', 'what', 'makes', 'shanghai', 'ghetto', 'move', 'beyond', 'a', 'good', ',', 'dry', ',', 'reliable', 'textbook', 'and', 'what', 'allows', 'it', 'to', 'rank', 'with', 'its', 'worthy', 'predecessors', '.']),\n", - " list(['the', 'entire', 'movie', 'is', 'filled', 'with', 'deja', 'vu', 'moments', '.'])],\n", - " dtype=object), 'word_seq': tensor([[ 19, 184, 6, 1, 481, 9, 206, 50, 91, 1210, 1609, 1330,\n", - " 495, 5, 63, 4, 1269, 4, 1, 1184, 7, 50, 1050, 10,\n", - " 8, 1611, 16, 21, 1039, 1, 2],\n", - " [ 3, 711, 22, 9, 1282, 16, 2482, 2483, 200, 2, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0]])}\n", - "batch_y has: {'label_seq': tensor([3, 2])}\n" - ] - } - ], - "source": [ - "# 假设你们需要做强化学习或者gan之类的项目,也许你们可以使用这里的dataset\n", - "from fastNLP.core.batch import Batch\n", - "from fastNLP.core.sampler import RandomSampler\n", - "\n", - "batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())\n", - "for batch_x, batch_y in batch_iterator:\n", - " print(\"batch_x has: \", batch_x)\n", - " print(\"batch_y has: \", batch_y)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNNText(\n", - " (embed): Embedding(\n", - " (embed): Embedding(3470, 50, padding_idx=0)\n", - " (dropout): Dropout(p=0.0)\n", - " )\n", - " (conv_pool): ConvMaxpool(\n", - " (convs): ModuleList(\n", - " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", - " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", - " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", - " )\n", - " )\n", - " (dropout): Dropout(p=0.1)\n", - " (fc): Linear(\n", - " (linear): Linear(in_features=12, out_features=5, bias=True)\n", - " )\n", - ")" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 
定义一个简单的Pytorch模型\n", - "\n", - "from fastNLP.models import CNNText\n", - "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", - "model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Trainer & Tester\n", - "------\n", - "\n", - "使用fastNLP的Trainer训练模型" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import Trainer\n", - "from copy import deepcopy\n", - "from fastNLP.core.losses import CrossEntropyLoss\n", - "from fastNLP.core.metrics import AccuracyMetric" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training epochs started 2018-12-05 15:37:15\n" - ] - }, - { - "data": { - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1870), HTML(value='')), layout=Layout(display…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/10. Step:187/1870. AccuracyMetric: acc=0.351365\n", - "Epoch 2/10. Step:374/1870. AccuracyMetric: acc=0.470943\n", - "Epoch 3/10. Step:561/1870. AccuracyMetric: acc=0.600402\n", - "Epoch 4/10. Step:748/1870. AccuracyMetric: acc=0.702227\n", - "Epoch 5/10. Step:935/1870. AccuracyMetric: acc=0.79099\n", - "Epoch 6/10. Step:1122/1870. AccuracyMetric: acc=0.846424\n", - "Epoch 7/10. Step:1309/1870. AccuracyMetric: acc=0.874058\n", - "Epoch 8/10. Step:1496/1870. AccuracyMetric: acc=0.898844\n", - "Epoch 9/10. Step:1683/1870. AccuracyMetric: acc=0.910568\n", - "Epoch 10/10. Step:1870/1870. AccuracyMetric: acc=0.921286\n", - "\r" - ] - } - ], - "source": [ - "# 进行overfitting测试\n", - "copy_model = deepcopy(model)\n", - "overfit_trainer = Trainer(model=copy_model, \n", - " train_data=test_data, \n", - " dev_data=test_data,\n", - " losser=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", - " metrics=AccuracyMetric(),\n", - " n_epochs=10,\n", - " save_path=None)\n", - "overfit_trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training epochs started 2018-12-05 15:37:41\n" - ] - }, - { - "data": { - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=400), HTML(value='')), layout=Layout(display=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r" - ] - }, - { - "ename": "AttributeError", - "evalue": "'NoneType' object has no attribute 'squeeze'", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mn_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m save_path='save/')\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Train 
finished!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_summary_writer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSummaryWriter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_tqdm\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tqdm_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_print_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/trainer.py\u001b[0m in \u001b[0;36m_tqdm_train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0mpbar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0meval_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate_every\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0meval_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_validation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0meval_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Epoch {}/{}. Step:{}/{}. 
\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_epochs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal_steps\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtester\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_format_eval_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0meval_res\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/fastNLP/fastNLP/fastNLP/core/trainer.py\u001b[0m in \u001b[0;36m_do_validation\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtester\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 267\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_summary_writer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"valid_{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 268\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_path\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_better_eval_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0mmetric_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetric_key\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetric_key\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"None\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda2/envs/python3/lib/python3.6/site-packages/tensorboardX/writer.py\u001b[0m in \u001b[0;36madd_scalar\u001b[0;34m(self, tag, scalar_value, global_step, walltime)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_caffe2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0mscalar_value\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mworkspace\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFetchBlob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 334\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_writer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_summary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscalar_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwalltime\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0madd_scalars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmain_tag\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtag_scalar_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwalltime\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda2/envs/python3/lib/python3.6/site-packages/tensorboardX/summary.py\u001b[0m in \u001b[0;36mscalar\u001b[0;34m(name, scalar, collections)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_clean_tag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0mscalar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_np\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0;32massert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'scalar should be 0D'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0mscalar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mSummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSummary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mValue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msimple_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscalar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'squeeze'" - ], - "output_type": "error" - } - ], - "source": [ - "# 实例化Trainer,传入模型和数据,进行训练\n", - "trainer = Trainer(model=model, \n", - " train_data=train_data, \n", - " dev_data=test_data,\n", - " losser=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", - " metrics=AccuracyMetric(),\n", - " n_epochs=5,\n", - " save_path='save/')\n", - "trainer.train()\n", - 
"print('Train finished!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import Tester\n", - "\n", - "tester = Tester(data=test_data, model=model, metrics=AccuracyMetric())\n", - "acc = tester.test()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In summary\n", - "\n", - "## fastNLP Trainer的伪代码逻辑\n", - "### 1. 准备DataSet,假设DataSet中共有如下的fields\n", - " ['raw_sentence', 'word_seq1', 'word_seq2', 'raw_label','label']\n", - " 通过\n", - " DataSet.set_input('word_seq1', word_seq2', flag=True)将'word_seq1', 'word_seq2'设置为input\n", - " 通过\n", - " DataSet.set_target('label', flag=True)将'label'设置为target\n", - "### 2. 初始化模型\n", - " class Model(nn.Module):\n", - " def __init__(self):\n", - " xxx\n", - " def forward(self, word_seq1, word_seq2):\n", - " # (1) 这里使用的形参名必须和DataSet中的input field的名称对应。因为我们是通过形参名, 进行赋值的\n", - " # (2) input field的数量可以多于这里的形参数量。但是不能少于。\n", - " xxxx\n", - " # 输出必须是一个dict\n", - "### 3. Trainer的训练过程\n", - " (1) 从DataSet中按照batch_size取出一个batch,调用Model.forward\n", - " (2) 将 Model.forward的结果 与 标记为target的field 传入Losser当中。\n", - " 由于每个人写的Model.forward的output的dict可能key并不一样,比如有人是{'pred':xxx}, {'output': xxx}; \n", - " 另外每个人将target可能也会设置为不同的名称, 比如有人是label, 有人设置为target;\n", - " 为了解决以上的问题,我们的loss提供映射机制\n", - " 比如CrossEntropyLosser的需要的输入是(prediction, target)。但是forward的output是{'output': xxx}; 'label'是target\n", - " 那么初始化losser的时候写为CrossEntropyLosser(prediction='output', target='label')即可\n", - " (3) 对于Metric是同理的\n", - " Metric计算也是从 forward的结果中取值 与 设置target的field中取值。 也是可以通过映射找到对应的值 \n", - " \n", - " \n", - "\n", - "## 一些问题.\n", - "### 1. DataSet中为什么需要设置input和target\n", - " 只有被设置为input或者target的数据才会在train的过程中被取出来\n", - " (1.1) 我们只会在设置为input的field中寻找传递给Model.forward的参数。\n", - " (1.2) 我们在传递值给losser或者metric的时候会使用来自: \n", - " (a)Model.forward的output\n", - " (b)被设置为target的field\n", - " \n", - "\n", - "### 2. 我们是通过forwad中的形参名将DataSet中的field赋值给对应的参数\n", - " (1.1) 构建模型过程中,\n", - " 例如:\n", - " DataSet中x,seq_lens是input,那么forward就应该是\n", - " def forward(self, x, seq_lens):\n", - " pass\n", - " 我们是通过形参名称进行匹配的field的\n", - " \n", - "\n", - "\n", - "### 1. 加载数据到DataSet\n", - "### 2. 使用apply操作对DataSet进行预处理\n", - " (2.1) 处理过程中将某些field设置为input,某些field设置为target\n", - "### 3. 
构建模型\n", - " (3.1) 构建模型过程中,需要注意forward函数的形参名需要和DataSet中设置为input的field名称是一致的。\n", - " 例如:\n", - " DataSet中x,seq_lens是input,那么forward就应该是\n", - " def forward(self, x, seq_lens):\n", - " pass\n", - " 我们是通过形参名称进行匹配的field的\n", - " (3.2) 模型的forward的output需要是dict类型的。\n", - " 建议将输出设置为{\"pred\": xx}.\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/fastnlp_tutorial_1204.ipynb b/tutorials/fastnlp_tutorial_1204.ipynb deleted file mode 100644 index 8d896bf2..00000000 --- a/tutorials/fastnlp_tutorial_1204.ipynb +++ /dev/null @@ -1,447 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "fastNLP上手教程\n", - "-------\n", - "\n", - "fastNLP提供方便的数据预处理,训练和测试模型的功能" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP/')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "DataSet & Instance\n", - "------\n", - "\n", - "fastNLP用DataSet和Instance保存和处理数据。每个DataSet表示一个数据集,每个Instance表示一个数据样本。一个DataSet存有多个Instance,每个Instance可以自定义存哪些内容。\n", - "\n", - "有一些read_*方法,可以轻松从文件读取数据,存成DataSet。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import DataSet\n", - "from fastNLP import Instance\n", - "\n", - "# 从csv读取数据到DataSet\n", - "dataset = DataSet.read_csv('../sentence.csv', headers=('raw_sentence', 'label'), sep='\\t')\n", - "print(len(dataset))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 使用数字索引[k],获取第k个样本\n", - "print(dataset[0])\n", - "\n", - "# 索引也可以是负数\n", - "print(dataset[-3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instance\n", - "Instance表示一个样本,由一个或多个field(域,属性,特征)组成,每个field有名字和值。\n", - "\n", - "在初始化Instance时即可定义它包含的域,使用 \"field_name=field_value\"的写法。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# DataSet.append(Instance)加入新数据\n", - "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", - "dataset[-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DataSet.apply方法\n", - "数据预处理利器" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 将所有数字转为小写\n", - "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# label转int\n", - "dataset.apply(lambda x: int(x['label']), new_field_name='label')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 使用空格分割句子\n", - "def split_sent(ins):\n", - " return 
ins['raw_sentence'].split()\n", - "dataset.apply(split_sent, new_field_name='words')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 增加长度信息\n", - "dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DataSet.drop\n", - "筛选数据" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.drop(lambda x: x['seq_len'] <= 3)\n", - "print(len(dataset))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 配置DataSet\n", - "1. 哪些域是特征,哪些域是标签\n", - "2. 切分训练集/验证集" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 设置DataSet中,哪些field要转为tensor\n", - "\n", - "# set target,loss或evaluate中的golden,计算loss,模型评估时使用\n", - "dataset.set_target(\"label\")\n", - "# set input,模型forward时使用\n", - "dataset.set_input(\"words\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 分出测试集、训练集\n", - "\n", - "test_data, train_data = dataset.split(0.3)\n", - "print(len(test_data))\n", - "print(len(train_data))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Vocabulary\n", - "------\n", - "\n", - "fastNLP中的Vocabulary轻松构建词表,将词转成数字" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import Vocabulary\n", - "\n", - "# 构建词表, Vocabulary.add(word)\n", - "vocab = Vocabulary(min_freq=2)\n", - "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n", - "vocab.build_vocab()\n", - "\n", - "# index句子, Vocabulary.to_index(word)\n", - "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')\n", - "test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')\n", - "\n", - "\n", - "print(test_data[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model\n", - "定义一个PyTorch模型" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP.models import CNNText\n", - "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n", - "model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "这是上述模型的forward方法。如果你不知道什么是forward方法,请参考我们的PyTorch教程。\n", - "\n", - "注意两点:\n", - "1. forward参数名字叫**word_seq**,请记住。\n", - "2. forward的返回值是一个**dict**,其中有个key的名字叫**output**。\n", - "\n", - "```Python\n", - " def forward(self, word_seq):\n", - " \"\"\"\n", - "\n", - " :param word_seq: torch.LongTensor, [batch_size, seq_len]\n", - " :return output: dict of torch.LongTensor, [batch_size, num_classes]\n", - " \"\"\"\n", - " x = self.embed(word_seq) # [N,L] -> [N,L,C]\n", - " x = self.conv_pool(x) # [N,L,C] -> [N,C]\n", - " x = self.dropout(x)\n", - " x = self.fc(x) # [N,C] -> [N, N_class]\n", - " return {'output': x}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "这是上述模型的predict方法,是用来直接输出该任务的预测结果,与forward目的不同。\n", - "\n", - "注意两点:\n", - "1. predict参数名也叫**word_seq**。\n", - "2. 
predict的返回值是也一个**dict**,其中有个key的名字叫**predict**。\n", - "\n", - "```\n", - " def predict(self, word_seq):\n", - " \"\"\"\n", - "\n", - " :param word_seq: torch.LongTensor, [batch_size, seq_len]\n", - " :return predict: dict of torch.LongTensor, [batch_size, seq_len]\n", - " \"\"\"\n", - " output = self(word_seq)\n", - " _, predict = output['output'].max(dim=1)\n", - " return {'predict': predict}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Trainer & Tester\n", - "------\n", - "\n", - "使用fastNLP的Trainer训练模型" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import Trainer\n", - "from copy import deepcopy\n", - "from fastNLP.core.losses import CrossEntropyLoss\n", - "from fastNLP.core.metrics import AccuracyMetric\n", - "\n", - "\n", - "# 更改DataSet中对应field的名称,与模型的forward的参数名一致\n", - "# 因为forward的参数叫word_seq, 所以要把原本叫words的field改名为word_seq\n", - "# 这里的演示是让你了解这种**命名规则**\n", - "train_data.rename_field('words', 'word_seq')\n", - "test_data.rename_field('words', 'word_seq')\n", - "\n", - "# 顺便把label换名为label_seq\n", - "train_data.rename_field('label', 'label_seq')\n", - "test_data.rename_field('label', 'label_seq')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### loss\n", - "训练模型需要提供一个损失函数\n", - "\n", - "下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n", - "\n", - "pred参数对应的是模型的forward返回的dict的一个key的名字,这里是\"output\"。\n", - "\n", - "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "loss = CrossEntropyLoss(pred=\"output\", target=\"label_seq\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Metric\n", - "定义评价指标\n", - "\n", - "这里使用准确率。参数的“命名规则”跟上面类似。\n", - "\n", - "pred参数对应的是模型的predict方法返回的dict的一个key的名字,这里是\"predict\"。\n", - "\n", - "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metric = AccuracyMetric(pred=\"predict\", target=\"label_seq\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 实例化Trainer,传入模型和数据,进行训练\n", - "# 先在test_data拟合\n", - "copy_model = deepcopy(model)\n", - "overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,\n", - " losser=loss,\n", - " metrics=metric,\n", - " save_path=None,\n", - " batch_size=32,\n", - " n_epochs=5)\n", - "overfit_trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 用train_data训练,在test_data验证\n", - "trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,\n", - " losser=CrossEntropyLoss(pred=\"output\", target=\"label_seq\"),\n", - " metrics=AccuracyMetric(pred=\"predict\", target=\"label_seq\"),\n", - " save_path=None,\n", - " batch_size=32,\n", - " n_epochs=5)\n", - "trainer.train()\n", - "print('Train finished!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 调用Tester在test_data上评价效果\n", - "from fastNLP import Tester\n", - "\n", - "tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred=\"predict\", target=\"label_seq\"),\n", - " batch_size=4)\n", - "acc = tester.test()\n", - "print(acc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 267baec2244b1812fa3bdb01a66b7c05986352c2 Mon Sep 17 00:00:00 2001 From: yunfan Date: Fri, 7 Dec 2018 15:19:56 +0800 Subject: [PATCH 66/67] add dataloader register --- fastNLP/core/__init__.py | 6 ++- fastNLP/core/dataset.py | 34 +++++++++----- fastNLP/core/trainer.py | 8 ++-- fastNLP/io/base_loader.py | 36 +++++++++++++++ fastNLP/io/dataset_loader.py | 89 ++++++++++++++++++++++++++++++++---- 5 files changed, 147 insertions(+), 26 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 44f30fad..038ca12f 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -1,5 +1,5 @@ from .batch import Batch -from .dataset import DataSet +# from .dataset import DataSet from .fieldarray import FieldArray from .instance import Instance from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward @@ -8,4 +8,6 @@ from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler from .tester import Tester from .trainer import Trainer -from .vocabulary import Vocabulary \ No newline at end of file +from .vocabulary import Vocabulary +from ..io.dataset_loader import DataSet + diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index d4d285d7..a08961fc 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -5,8 +5,7 @@ from fastNLP.core.fieldarray import FieldArray from fastNLP.core.instance import Instance from fastNLP.core.utils import get_func_signature - -_READERS = {} +from fastNLP.io.base_loader import DataLoaderRegister class DataSet(object): @@ -98,6 +97,24 @@ def __getitem__(self, idx): else: raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) + def __getattr__(self, item): + if item == "field_arrays": + raise AttributeError + # TODO dataset.x + if item in self.field_arrays: + return self.field_arrays[item] + try: + reader = DataLoaderRegister.get_reader(item) + return reader + except AttributeError: + raise + + def __setstate__(self, state): + self.__dict__ = state + + def __getstate__(self): + return self.__dict__ + def __len__(self): """Fetch the length of the dataset. @@ -226,16 +243,6 @@ def get_target_name(self): """ return [name for name, field in self.field_arrays.items() if field.is_target] - @classmethod - def set_reader(cls, method_name): - assert isinstance(method_name, str) - - def wrapper(read_cls): - _READERS[method_name] = read_cls - return read_cls - - return wrapper - def apply(self, func, new_field_name=None, **kwargs): """Apply a function to every instance of the DataSet. @@ -347,6 +354,9 @@ def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): _dict[header].append(content) return cls(_dict) + # def read_pos(self): + # return DataLoaderRegister.get_reader('read_pos') + def save(self, path): """Save the DataSet object as pickle. 
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index c2bca3a2..6cb6b560 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -85,8 +85,8 @@ def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch if metric_key is not None: self.increase_better = False if metric_key[0] == "-" else True self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key - else: - self.metric_key = None + elif metrics is not None: + self.metric_key = metrics[0].__class__.__name__.lower().strip('metric') # prepare loss losser = _prepare_losser(loss) @@ -147,7 +147,7 @@ def train(self): self._mode(self.model, is_test=False) - self.start_time = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + self.start_time = str(datetime.now().strftime('%Y-%m-%d %H-%M-%S')) print("training epochs started " + self.start_time, flush=True) if self.save_path is None: class psudoSW: @@ -260,7 +260,7 @@ def _do_validation(self): self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val, global_step=self.step) if self.save_path is not None and self._better_eval_result(res): - metric_key = self.metric_key if self.metric_key is not None else "None" + metric_key = self.metric_key if self.metric_key is not None else "" self._save_model(self.model, "best_" + "_".join([self.model.__class__.__name__, metric_key, self.start_time])) return res diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index b0b0d864..a3ce410b 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -29,3 +29,39 @@ def load_with_cache(cls, data_path, cache_path): with open(cache_path, 'wb') as f: pickle.dump(obj, f) return obj + + +class ToyLoader0(BaseLoader): + """ + For CharLM + """ + + def __init__(self, data_path): + super(ToyLoader0, self).__init__(data_path) + + def load(self): + with open(self.data_path, 'r') as f: + corpus = f.read().lower() + import re + corpus = re.sub(r"", "unk", corpus) + return corpus.split() + + +class DataLoaderRegister: + """"register for data sets""" + _readers = {} + + @classmethod + def set_reader(cls, reader_cls, read_fn_name): + # def wrapper(reader_cls): + if read_fn_name in cls._readers: + raise KeyError('duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, read_fn_name)) + if hasattr(reader_cls, 'load'): + cls._readers[read_fn_name] = reader_cls().load + return reader_cls + + @classmethod + def get_reader(cls, read_fn_name): + if read_fn_name in cls._readers: + return cls._readers[read_fn_name] + raise AttributeError('no read function: {}'.format(read_fn_name)) diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 0d30c6e8..a1cfe33f 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -2,7 +2,7 @@ from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance -from fastNLP.io.base_loader import BaseLoader +from fastNLP.io.base_loader import DataLoaderRegister def convert_seq_dataset(data): @@ -61,12 +61,9 @@ def convert_seq2seq_dataset(data): return dataset -class DataSetLoader(BaseLoader): +class DataSetLoader: """"loader for data sets""" - def __init__(self): - super(DataSetLoader, self).__init__() - def load(self, path): """ load data in `path` into a dataset """ @@ -104,9 +101,9 @@ def load(self, data_path, split=None): def convert(self, data): return convert_seq_dataset(data) +DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata') 
-@DataSet.set_reader('read_pos') class POSDataSetLoader(DataSetLoader): """Dataset Loader for POS Tag datasets. @@ -174,9 +171,9 @@ def convert(self, data): """Convert lists of strings into Instances with Fields. """ return convert_seq2seq_dataset(data) +DataLoaderRegister.set_reader(POSDataSetLoader, 'read_pos') -@DataSet.set_reader('read_tokenize') class TokenizeDataSetLoader(DataSetLoader): """ Data set loader for tokenization data sets @@ -236,7 +233,6 @@ def convert(self, data): return convert_seq2seq_dataset(data) -@DataSet.set_reader('read_class') class ClassDataSetLoader(DataSetLoader): """Loader for classification data sets""" @@ -275,6 +271,83 @@ def convert(self, data): return convert_seq2tag_dataset(data) +class ConllLoader(DataSetLoader): + """loader for conll format files""" + + def __init__(self): + """ + :param str data_path: the path to the conll data set + """ + super(ConllLoader, self).__init__() + + def load(self, data_path): + """ + :return: list lines: all lines in a conll file + """ + with open(data_path, "r", encoding="utf-8") as f: + lines = f.readlines() + data = self.parse(lines) + return self.convert(data) + + @staticmethod + def parse(lines): + """ + :param list lines:a list containing all lines in a conll file. + :return: a 3D list + """ + sentences = list() + tokens = list() + for line in lines: + if line[0] == "#": + # skip the comments + continue + if line == "\n": + sentences.append(tokens) + tokens = [] + continue + tokens.append(line.split()) + return sentences + + def convert(self, data): + pass + + +class LMDataSetLoader(DataSetLoader): + """Language Model Dataset Loader + + This loader produces data for language model training in a supervised way. + That means it has X and Y. + + """ + + def __init__(self): + super(LMDataSetLoader, self).__init__() + + def load(self, data_path): + if not os.path.exists(data_path): + raise FileNotFoundError("file {} not found.".format(data_path)) + with open(data_path, "r", encoding="utf=8") as f: + text = " ".join(f.readlines()) + tokens = text.strip().split() + data = self.sentence_cut(tokens) + return self.convert(data) + + def sentence_cut(self, tokens, sentence_length=15): + start_idx = 0 + data_set = [] + for idx in range(len(tokens) // sentence_length): + x = tokens[start_idx * idx: start_idx * idx + sentence_length] + y = tokens[start_idx * idx + 1: start_idx * idx + sentence_length + 1] + if start_idx * idx + sentence_length + 1 >= len(tokens): + # ad hoc + y.extend([""]) + data_set.append([x, y]) + return data_set + + def convert(self, data): + pass + + @DataSet.set_reader('read_people_daily') class PeopleDailyCorpusLoader(DataSetLoader): """ From db0a789d619c0e47564c89c910ba1db9e26a49c1 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Fri, 7 Dec 2018 19:09:50 +0800 Subject: [PATCH 67/67] * final clean up * remove conflicts * all tests passed --- fastNLP/core/dataset.py | 4 ++-- fastNLP/core/trainer.py | 2 +- fastNLP/io/base_loader.py | 16 ---------------- fastNLP/io/dataset_loader.py | 10 +++++++--- test/core/test_dataset.py | 14 ++++++++++++++ test/core/test_optimizer.py | 18 ++++++++++++++---- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index a08961fc..52dac2fc 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -98,10 +98,10 @@ def __getitem__(self, idx): raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) def __getattr__(self, item): + # Not tested. Don't use !! 
if item == "field_arrays": raise AttributeError - # TODO dataset.x - if item in self.field_arrays: + if isinstance(item, str) and item in self.field_arrays: return self.field_arrays[item] try: reader = DataLoaderRegister.get_reader(item) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 6cb6b560..5997ebbc 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -85,7 +85,7 @@ def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch if metric_key is not None: self.increase_better = False if metric_key[0] == "-" else True self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key - elif metrics is not None: + elif len(metrics) > 0: self.metric_key = metrics[0].__class__.__name__.lower().strip('metric') # prepare loss diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index a3ce410b..b01c233a 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -31,22 +31,6 @@ def load_with_cache(cls, data_path, cache_path): return obj -class ToyLoader0(BaseLoader): - """ - For CharLM - """ - - def __init__(self, data_path): - super(ToyLoader0, self).__init__(data_path) - - def load(self): - with open(self.data_path, 'r') as f: - corpus = f.read().lower() - import re - corpus = re.sub(r"", "unk", corpus) - return corpus.split() - - class DataLoaderRegister: """"register for data sets""" _readers = {} diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index a1cfe33f..641a631e 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -75,7 +75,6 @@ def convert(self, data): raise NotImplementedError -@DataSet.set_reader("read_naive") class NativeDataSetLoader(DataSetLoader): def __init__(self): super(NativeDataSetLoader, self).__init__() @@ -87,7 +86,9 @@ def load(self, path): return ds -@DataSet.set_reader('read_raw') +DataLoaderRegister.set_reader(NativeDataSetLoader, 'read_naive') + + class RawDataSetLoader(DataSetLoader): def __init__(self): super(RawDataSetLoader, self).__init__() @@ -101,6 +102,8 @@ def load(self, data_path, split=None): def convert(self, data): return convert_seq_dataset(data) + + DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata') @@ -171,6 +174,8 @@ def convert(self, data): """Convert lists of strings into Instances with Fields. 
""" return convert_seq2seq_dataset(data) + + DataLoaderRegister.set_reader(POSDataSetLoader, 'read_pos') @@ -348,7 +353,6 @@ def convert(self, data): pass -@DataSet.set_reader('read_people_daily') class PeopleDailyCorpusLoader(DataSetLoader): """ People Daily Corpus: Chinese word segmentation, POS tag, NER diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 74ad5958..01963af6 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -178,6 +178,20 @@ def test_get_field(self): self.assertTrue(isinstance(ans, FieldArray)) self.assertEqual(ans.content, [[5, 6]] * 10) + def test_reader(self): + # 跑通即可 + ds = DataSet().read_naive("test/data_for_tests/tutorial_sample_dataset.csv") + self.assertTrue(isinstance(ds, DataSet)) + self.assertTrue(len(ds) > 0) + + ds = DataSet().read_rawdata("test/data_for_tests/people_daily_raw.txt") + self.assertTrue(isinstance(ds, DataSet)) + self.assertTrue(len(ds) > 0) + + ds = DataSet().read_pos("test/data_for_tests/people.txt") + self.assertTrue(isinstance(ds, DataSet)) + self.assertTrue(len(ds) > 0) + class TestDataSetIter(unittest.TestCase): def test__repr__(self): diff --git a/test/core/test_optimizer.py b/test/core/test_optimizer.py index 8ffa1a72..83ed6000 100644 --- a/test/core/test_optimizer.py +++ b/test/core/test_optimizer.py @@ -7,7 +7,7 @@ class TestOptim(unittest.TestCase): def test_SGD(self): - optim = SGD(torch.nn.Linear(10, 3).parameters()) + optim = SGD(model_params=torch.nn.Linear(10, 3).parameters()) self.assertTrue("lr" in optim.__dict__["settings"]) self.assertTrue("momentum" in optim.__dict__["settings"]) res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) @@ -22,13 +22,18 @@ def test_SGD(self): self.assertEqual(optim.__dict__["settings"]["lr"], 0.002) self.assertEqual(optim.__dict__["settings"]["momentum"], 0.989) - with self.assertRaises(RuntimeError): + optim = SGD(0.001) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) + res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) + self.assertTrue(isinstance(res, torch.optim.SGD)) + + with self.assertRaises(TypeError): _ = SGD("???") - with self.assertRaises(RuntimeError): + with self.assertRaises(TypeError): _ = SGD(0.001, lr=0.002) def test_Adam(self): - optim = Adam(torch.nn.Linear(10, 3).parameters()) + optim = Adam(model_params=torch.nn.Linear(10, 3).parameters()) self.assertTrue("lr" in optim.__dict__["settings"]) self.assertTrue("weight_decay" in optim.__dict__["settings"]) res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) @@ -42,3 +47,8 @@ def test_Adam(self): optim = Adam(lr=0.002, weight_decay=0.989) self.assertEqual(optim.__dict__["settings"]["lr"], 0.002) self.assertEqual(optim.__dict__["settings"]["weight_decay"], 0.989) + + optim = Adam(0.001) + self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) + res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) + self.assertTrue(isinstance(res, torch.optim.Adam))