From 2bcf35b80ba765e8aee002bb0a19acd2799a5cc5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Jan 2021 13:54:17 -0500 Subject: [PATCH 01/18] BUG: read_excel with openpyxl and missing dimension --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_openpyxl.py | 6 ++++++ pandas/tests/io/data/excel/no_dimension.xlsx | Bin 0 -> 4875 bytes pandas/tests/io/excel/test_openpyxl.py | 21 +++++++++++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/excel/no_dimension.xlsx diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index baa0cc2ac9e18..d1418492a44c8 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -26,7 +26,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing dimension information (:issue:`38956`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 71e1bf6b43ad5..56778426f1eef 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -527,4 +527,10 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) + # openpyxl may not have the correct padding if the dimension tag is + # not specified or is incorrect + max_width = max(len(row) for row in data) + if min(len(row) for row in data) < max_width: + data = [row + (max_width - len(row)) * [""] for row in data] + return data diff --git a/pandas/tests/io/data/excel/no_dimension.xlsx b/pandas/tests/io/data/excel/no_dimension.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9274896689a72c83138e7ba580c12a66ac59bcf6 GIT binary patch literal 4875 zcmZ`-bzD?y*PQ{$0qKxZ8bn$V>5?8|7+Shx=t~VDjkJi;A)$bT2qGaVAPk*@j5J8d z+#v-);2ZRNulMracg}Cl`D^X-oV}mD*4mGb2JRJV0DzDX(EC^hVqYHqZUF!QNa6qh zAnen^%g)tXP~hTTnL4bFD@-1P364vF`_Gt{ zw_=W?d~bBgurP)HkVjci^gV_4u#%pB0fSbvtDxU^xE3MLlxfXEtY>Zmv^F85?GLHO z7_VGesy9cv*NLP>3ZgQ+ku6ddT>XwL% z`hK0n>=|23;t<&#o-H{N)ugUDgTflux3{7qUskau-yL)?r$R^e=X`%&?&8PZ^z?DpN?wZ4M;^4SzmFv4-LcEX;x|a7rIZ5`*bAANY zhE@aii&EAsyIaqhp#xk%X}+kM^*C{Atz4$C&9}67R@ydt(G{}NtN5Gd(fs-n7*P3^ zPdK#%F`tN9!k~gq`)9hxid$BtvpHV`$w^kZj|*ZIroBXTLdNjqD~Qkxp=Lvqa}SlY zIAY|<>tB2HB-73Y=%-HjoNIs~w)hgK;=IBbOQnI3_xZ}>wql9+qY_M{o!h=5M?wtR z_#3dSv4vEyh<0dd1=D7Dh?>fL;dA#DAOam_0RNMt-Zw1>v3bTU+Gg0bN%zk^LUv;U zzJhI(4LSgT^w%Eo_6cyc^S;<2Xd_eiY$5vK`Rdm{S{24Pz}y9!62fS^@+oN6ZFC0^ z*C1HFemwf@_)`YCW476tz6STM^-1mU_p1ab@vB2q3E`%+d-J@q?QU+SoWH{kb)-28RIx4o)k0g3uw6_|QVl@5pr=TPkb%M{he_p2#+K{MwV1+`y z8HDjsKvi$-*%2;JSBIW*(8Z>faA?0!Bj$LT2~5Jgqe?cH@RZd{PratRAB0G!u%Pq3 zN~MmvClxI~j>kF1puv)j@3p{No~G;BShVL989S|`Uz@s^tTG$ekkmcs;Of0Ho*jLtyCspVU$S8vtt z-a*N~o9>sFY>A-0l7hm(jrjvwi*p$Gd>J_S*^f;prrsj_ACRXg2j z*|=cuYRX*g05?+^bKI0@Iqr-Msk-M%dm!eIZ_glKDz7OPH7!=@Wjk#Lb|nx9OLlFw z)Z+_BWXKQPuqUrg&gk1=4d&%8ihzjTEUPTu*tRkK@_ z=b#=6b2!d}J&~(GN+dr_c9b3oFO^`&%d^C&9Ii218~=LdZZBP`V(6+Q=$2iy=(%{m}ms^tu^_^ zMy2oBox1O2;1V<+@9g$o8=Vw$_@Oa+6zwJVN;kmiJ>AErmKHifc?I>$*V_?L{QHNz zB8_2P6DiSK_3bxA$lo2pygmr^ed9$D8Q^xMq~u)F0FD#seNetC*+sCjt_Pem+mM;|QkpbIe|Z zF6``a)L5jXMqw!GSPHDhE8;8j&dP4owDwO{I3;$wg;q7y`o`cko6zEdb|QMX?mHLB zixa6Vx-w0bboK-RO?`cm367XUdZMAv-ryCM9*{CUw0gfhQEwHkV7(nK*SD5M|0aje zbN@AS{dA)k8(;V)wHV4@ik__|tw(r!MlbR_`f*VNv#pXdT)7;V1SsC$1pOfj^5{wSmxn6dI=Z!v3P-Kf+ 
z|0r%?v!02|@a;NaAJU2k%^N+Krb2G?uf0~0-irOIl9Nw+7jENOo7bSG^0`7Kc!~m< z+^`-CGPxT8P0LR7iTI@9sjwgf<3hA%a+!g&sy9QlqY8(^89NQc(eihiww`p4lwt_u zD2@_KrMtTZ26Gk{wtn2YMW9>T^=d{spmWFgG~($zQACN%s@iOU3xJYU(}j6r`EI&6 zDMu-PFuosnV`@dMbgXA69VLa5_k=g;n$51QzG-a_9`1BY>t~K<4^<6we~sghL*0$j zur}P8u5j%>&rlvS`{*)Fa%l0y#Q6M&k6JD`D&mg}*lIC6uoxZ{(<-Z3J0z_8ZLxo9 zMM^nAA}3{PcymS#?fth?Rm$d?Pp+JFC4O+{(fS1z|O@c$w$XjxI}$!+uxmyRryi zxeXN-J^g;?IKMgM!hdRSe-9kPN+~B+RvCUt>MslWck}hjjy?vvd=Mgu#+=e9A9kw| z*d&OzB*<5v9szrFmqxn8Qm5XYhX^p565DN!JQ+dmwW=O$-cH<%?NTJcFE6|j)zR%s z*b;X7CwlRk!C>Lsf+J`0>+6)S-cwCKq*D_#sofJd7O?J2ku)prQ`ag(_BBPIbp`u( z)y8vxgEV@Trnq%(d^*kEo#(G@kfc{Fis1P|y`)5gUM!DAMtJ15o zY|PW_i0X)Zh?pPIq=h{TCJ^LKgsRjD+4{k;xBD*her5ICY|y1l$ODkKtFRf6#F|j7 zKfhp8Z$~RHJ6k;;FDEyLpB^J|xP!P;hy;J3SzFf&sYOBIBnjOn+6N+N&8)xYM!p%D4{!@Kx zP;T$*(X8yNMQg?kDTtLEuYHJht0RlZ@iR%KM=j>iNp+R$gwQ9-lpqZq_NcG1C)&Qz z5u%tlQnFUw!Bu>lR@Mfa6qZCw^&a`#qY+m`BSmq^LQ|u9R}$WPMB6H8_o$AeVfG z7DLZ3bAU|>2sp0$I`q^wn>*CDRD?6r^Ep3dr|jkdbUO<`GmFwErQR!s{Q>nfnI!h| z0mR#P>-YE|oN(^d1}3+H?0S%!B%JxPrlBmZU#B4SMZ(`axSPb z&07*Bd2RBzkt#|{6c%wh6qU?nm5VNrDB@A3)n+JN$*;SbP)8@|Gmr@(_8algl}ITH zbnoVI&Ej%S)!IDaK7#GkZ7GvRIp@a#Z@#f%=M|M>-fm|s4sHw!0JE?ddP6mrh=^<` zxpv|>%4Ct_ha~4s^9U<2Xa2TA*1krgzd9xB54;&ZYazeeG4GF^WK2^0@vf+M! zCoL5jdSipQovNHBB;F|2z?DX62+4c;JBb-Ndm4?m6jQU~;b06qi zN_yIW|IADBw^cD?Y)%aE001g%UQ89;-F)oad@P^=9(LYl7pZB1N@`>G>f8w*TQ!ew z|3h;1nXr$9PkBT|b~uF-ufLfcJA)7LLiLGwV`KZ?U|Q3)eZ^3vT*eW5W9vDZ>lqvc zAok)auid%^4Z%KF2?VVvNT$Oa?G1^stft?)PZ|ezzU5ev*2Bx<@%5&CK+Oux>%E<$ za2jQT5mwU25t_A(Jm1TS`q22veochjeFs&7>Z_5asNQgbTeU`Zx#9$g7+}>~aj*0c z@uFB)co2W3?1_2D=BBD#dB&|gKX!9|AH^9QYnJV%TekY$D+%_0*09?sArk>`pIipJ6Yvi>e-C|&~}9x z7jE|SUjB9ssZhs{0!K!m*vu>zEL!FPcF1IY|lxBvhE literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..b4442ceec8df5 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -116,3 +116,24 @@ def test_to_excel_with_openpyxl_engine(ext): ).highlight_max() styled.to_excel(filename, engine="openpyxl") + + +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +def test_read_with_missing_dimension(datapath, ext, header, expected_data): + path = datapath("io", "data", "excel", f"no_dimension{ext}") + result = pd.read_excel(path, header=header) + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) From ea18d610bbb62bb5c097b022472680600da6bffd Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Jan 2021 15:49:40 -0500 Subject: [PATCH 02/18] fixups --- pandas/io/excel/_openpyxl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 56778426f1eef..c935eda7e4172 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -529,8 +529,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: # openpyxl may not have the correct padding if the dimension tag is # not specified or is incorrect - max_width = max(len(row) for row in data) - if min(len(row) for row in data) < max_width: - data = [row + (max_width - len(row)) * [""] for row in data] + if len(data) > 0: + max_width = max(len(data_row) for data_row in data) + if min(len(data_row) for data_row in data) < max_width: + data = [ + data_row + (max_width 
- len(data_row)) * [""] for data_row in data + ] return data From d5215f776393292d794e7e34d8aad779494b1604 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 31 Jan 2021 09:27:12 -0500 Subject: [PATCH 03/18] Added fixes for incorrect dimension information --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_openpyxl.py | 8 ++++++-- pandas/tests/io/data/excel/dimension_large.xlsx | Bin 0 -> 4894 bytes ...{no_dimension.xlsx => dimension_missing.xlsx} | Bin pandas/tests/io/data/excel/dimension_small.xlsx | Bin 0 -> 4894 bytes pandas/tests/io/excel/test_openpyxl.py | 8 ++++++-- 6 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/data/excel/dimension_large.xlsx rename pandas/tests/io/data/excel/{no_dimension.xlsx => dimension_missing.xlsx} (100%) create mode 100644 pandas/tests/io/data/excel/dimension_small.xlsx diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index d1418492a44c8..50abb0e872331 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -26,7 +26,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing dimension information (:issue:`38956`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index c935eda7e4172..3f7d6382314bb 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -523,12 +523,16 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + # GH 39001 + # Reading of excel file depends on dimension data being correct but + # writers sometimes omit or get it wrong + sheet.reset_dimensions() + data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) - # openpyxl may not have the correct padding if the dimension tag is - # not specified or is incorrect + # With dimension reset, openpyxl no longer pads rows if len(data) > 0: max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6a8daf607f299df20579bfc31ff97bea919ae1bd GIT binary patch literal 4894 zcmZ`-cRXBO*BxZE8NJu&J!**Pq7yTW76f7Ro*{_QA|yi)#1kc>OM)OoNz~|%5xoSX zMvEFXO0;hz-}mN;?|JY2-FyF9`<}D**=w(J4D<*t(Em@b@zS06-oe z007~>HV6lIA2HGMcX48`E`bE)pd*>x<^<(=L}Lq@qR>dX~C-I zWB++;e9i0qvt~K*SM~)o#iZ(}puOrwCRxmoT6Z!3LnMS);In1f4C-$B2jlXXl#llm zsvjfLm1(|4l$`op|3Iwy^t3KffBBR1aAQBevogTo4|oU(pj_|lZe6>C?P=LBYP@qkIn*O=5qO?tOz+K#Q~IcHN; z4lf0}!WH2wZQQ2M>vWLS=Nfqr+w&GH=&co;(!extcX`nW_ zh{(gsV5{d`k7Ii%qy^Sgz}j)GQ>Hm(?v)~u5wA-)pI+;BvZh&&_=G-uV}`YfP2LI$ zb%u4unPXl8T`LA&i;ekrB*@LeB;U>ue^FrE7YtQ#&buk3*7JFbXU0DCZ_7c>DHm?* ziW{QutNUT?;O;LLv8bGmGrF;=hk^5{>*-&evPpxZnTnUT7ef#uPKI|hK8I((@meJJySo6&DCstRG!5k=pcn*4Fld}$NKjj|&7VXsicx|W_d8*^LU->k*3IRJXL;4L7gk7Y*HW1T2~nUTqAqf zGxH@}p&T&)0A#<`h>!1mcL$&I6|#P7(({ElWBs(ztp*!Tm&8{z2KUla{G?3Wg!I$= zzJ%euIaRG1e57>Zev`POyYS_k=aBjKgYw?PIEt#ftmtAU;QL!bTmi&dT1uBv__=r$ 
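
A rough standalone sketch of the padding step introduced in the hunks above (patches 01-03): once sheet.reset_dimensions() tells openpyxl to ignore the worksheet's dimension metadata, rows can come back with unequal lengths, and get_sheet_data squares them off itself. The helper name pad_rows below is illustrative only, not part of the patch:

    def pad_rows(data):
        # With the (possibly wrong) dimension tag discarded, openpyxl may
        # yield rows of unequal length; pad short rows with "" so downstream
        # parsing sees a rectangular block, as get_sheet_data now does.
        if not data:
            return data
        max_width = max(len(row) for row in data)
        if min(len(row) for row in data) < max_width:
            data = [row + (max_width - len(row)) * [""] for row in data]
        return data

    # pad_rows([["Title"], ["A", "B", "C"], [1, 2]])
    # -> [["Title", "", ""], ["A", "B", "C"], [1, 2, ""]]
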
zD878>3Q>Nn;?ioQ4*Z+5s5IQEzgA^AdiD)IoD580#wk)EsvMA^OWGALN6owVikBvl z)1y#w-&jyW_b;HtN^EfGqJ+9^41R;1#2_h;U0Y>B|D!PBhCDvm7b-JLqUx@BLG7%1 z>e&oHYc)z>>WpQ^b>9a92KzJghHA6p-&iHqR=y_6J_1YJ5N2QdnZSzdtk<~ST3z*TB_bBs<1x*i#HUBsil>a zMymN+)lh1abON<@YrwXT@|#P^f~oz3XkzfAr94o)7?7o_Vc>Ihx2CQZLb#Fq;$Bl8 zRQ#oxU+!Usht>wmz`eZy_{_bwH5*gp&QeMmsgRyUTXW(NO&?I{vsGZWk1nET^I=>v z%o)8Dm|4_f7$(D}%E?oGZDbz}tFD?;-Ve3KV#J62Y8It!P(Ag~ zqso*f2<@n4C1psZe^Ui~0-=%5H!)oIcNnON5xaX?9~m~64%lot8w+<->|4|6)79;I zR@jE#6@NBu8(6)lIDs+xpeI7JLvGRGLNV*PTjiM6xrbHy0sLJOYdXp>7ktPf-X#FQ z2iH6p0j&Qgn?BAC4!%BOKR*)Z)nQ^x0w4&grN49j+zS8!@GdRkl=kPV-gMk^T$VC; z!;Cm^Y4CGm5LBpE8|_%D!c24*i+`l~BuOKJ%RKSji=Y$of+utsfpEJ7fKF>_|CdxL zTBlj)N5tZK)PV73%g2kV z6A!Zeh(5{YTU)oMghy1ZWpFJwe?;{t_j*Kt4j>rS_Zf=mQ7zJ&!i)pbXsOd;fre~! z*l>Q1t zCd^nIF4*XIV|Y(d-1=am69a&0*S1BAI}YyFx3-@W91NCiD1s8as9RO?1i+T7mivJt zt_yXl&>Zh~#@0s??vVi%d{0=gT-72)b<|@NukF4diHnf1chxc2)ol~#nJ*B}#z+h_ z5mKV(vkn0#)xhwL4^+!AVP;oTZZ?l8FhiKrvxIC=!>ki6ctNAR2C-P!vY2kRS4W08 z%0B`tGn(Bc7ND9n>k|pKXxC*`$d5Fs>C>EdFes0l^pItZP!Y*xq?ron+?*Ci_+|xx zbv!7vgI8#3Gp`i;jW>T{E;@2xBtmATHHyW%ro@J{h%lQwoZ8rNYq^hd(Mz}9Zgo($ zJCL*+e#)2*@jPzF#IltoaJNJG%rs*xspcSv6V%x9Jf9bOA}{&&YtiIq?B8|xt3z6k zeG!A#Af8w~&mkR8Y_8He&&X5RRUU>4Pr?F$Yi`RmmC35eFmj|je=S^i*&^%Za59pQ zHkf{bqbi&#EFW&|7bkIbSw8%oScrzJmMwgd86um~?ewe=#Fp$0t-bCniI!H*IyPmD z|B=M+WA%+Vp48T2HES6Y8DYv1h3#{iEfJh_l@~O@jxHiB=PhAV86}}3`}Zr|MZKrX zOWhXs+GiUtOE!;W0nh*qRlqfxD?x%uhNv5JW0FY<*51e5xv!W=6jH)0)~`K~si3ZA zE2&t|Vs($?8?o}6gx%I5<;U>BMrNFKBJx_Ty0SD1K35KQv|QEChD^vFbHBvawIA%9 z223TFHGW_fdcI*Os$4QLy{`FSS%@f$C8@(&-?P4vod)gi zD_3JzP_1fU!or+OkxgxW#PttOzO8>@GVRWpnsMfRT6&rKO&!f>7=wR$wg+0caV{EBiIw9 z`e9G)W`xBgiFQ2>7Zewix!6XzJKsLcgvt1m4BI%eZv=w8NXCl|Le`FXcDCQAY81bn znhe7KlL=M8$kh@Y1M)auinHwJ;_BmUi*T?v@C&EjCfnOdkJVgy1( z6pg=3>_F)qrTu9y$aBA%Gskz%qY=9>y9Q@=$sf<<)uZaIjfZu$ zE)yF+)1(IJ8E{9gqK=?`j~+^GM3Yf82zD791hm2mfF*$2!RVDw#a+m2M$0NZ9glgs z&S$Gw?Uf)`-6FxG9Nrln*6BMWaflTGPck({xtv^Hr`(rQX@ycVU^O6hQhAE{rOXQQ zmy^`sehlhpB4p)=|G;CbVoj4Q(ls+0Pv{+-TTn`cZL^Uj zFSz=_eSLOzGauudvm_*NYQC*#o&x2O!F}*q%{24gU61JJio8mTcT%aahirTE6pdf$ zOcp*X2jB?>nfoPDxtf_9uF{7ryWt(5&M1oBQPbK+orLDO=z8{QGJ|V63H@?Tkig~W zIG6Ft$&es?pA&8PES&!~P*EWw^c22D_Gd<#3~D-8aG40iWrgye8L{>DK2Jz`JoLQc z+BS}8n|9M=L7YjmABvLFNyj-^YJt6`78GYKrn*1Yzk2lT)xF5bIM4}nuf1i2=Hy$q zg0jC!qn*h!!?Gc_WfYR#U_7)Ck~y~BdHg8%Ljq*1l8QQ6LC2-!K*S9&i|#Xp2XqCU zRiAq1bLmnX6ky8w7R^(i#=_OGL-(|7RlP}#nMzc)wIa>!Pt#Sr!4|$r8&<8o&!qf$a&PYmNO*GX>)QfjhZ_klgW^OkW(gDe{9D&ky} z)Go;Gy(;8*zighabZ6@zoKBOdkCQCn;mvXufx|))%sSyy{~BU`%=Z#8-GvCt@s4mJ zk@8y(>9Rzz8+av^vWVmm*<6%6GU!UN@{x7Z%8Is1VTwqGKezQ2Uo|Yg9s6cYI&FMc zHE92~Eder4hmlPdnmq8z)~o%BO^o@#soREul!G9M^3^TD2m>!R?=Nm)S1FJp zghU^&ZKcZR7(G#+317Q>Vog4@B<_cRiU%r$ z|M~B~IA-84$9%ypUmV!QcKLf?&q#kp|1Zt+Vw{W3=69TQoVWkC)^jn!MLGW+AsGBG z;(if+QG0&FDaii;|6e7#h`v~%ztQ}-_k_ny(!#Kofl00;>IO?M&sjwRvsvj6}<4hH}r z!9J~F4(>i;qUZPW)FBOA35uAFkho-cKpm~JeRXT>@DsI=u&n-5+pF2)jD8{T2tu>s zB7!9K(4B#+N;%V1jz8iMs&Oh_C1yhLiN_4^f;_VQRpwSVlL4+lV-0R{XkW>Ar~B@u zR8Qa=X7NuJdL4JHWan&(SQ=yOr|sd99+uX)XZA~bvBi5BXUtK8am;>XW>xX3N=(4K z6^_cakvWt6WRZP-6)~w6N}VBP1EXh5+Rg4_0f%sHLjFnf*R!#Aa~lk+>DM z^YU^N4fJqNFzEF}i&M1j3*lirM66tyGjL53lL6BdFMG37Rp{K}XnM9X^3O1o4n+be zC35L&Br`49_slQ|w`JD$JI{^!J8a*|>o$r;qPo`5P}wYrAx;xerkP^DsLj;C%Jg>O zz>Tq>jS;v5)R;q>WTyTe=t{lPbXdMxvDQG!?dly5Xc&DR&ResedK9-$a!9&lBcl%4 z!Il}EF$sg@SNPWyz-oz|7~}lc?zO^EkuNJalCBOoSy8V?en1?SnxL!`GPi>7JL~r) zn)VifTx&*OicJOdq{zf9jd)EH`d-E|4j0=}l%}vqz z#u5E)@W2`ZL? 
z9lyxiv2CwCXEp5S1zs15dc71UL!+I`@?fQw?#g`Ia?g4h^>zX41Jr|Mwc%L!Fr} zVH@R$9snTyy+(X|@3}kpoUf4eJJX&|#2H#=4DKMUIb2d-Q0w~TX8B7Qxe03J_p8U%X z&QM5!XaEE3JH9wLEN7#OU1MJf`>Yl!=prU z-%vn8<1t8LB_ZVgqJ*+cJPy)MVvLyEuB$d>Bsxq8S;#B%L}7MGRN1vKxSK^&`8hqv zN{IrLJ!}5-n(qUC-ThfQJ*7GHcNU4Ym2ass(O`+2Li8+X1^r9K{Yxxm+H|f@rCFf*Ctx_ExPSy`X%1f!;Kir*}O}=0@E3zB{exOTLS>(ms@( zTkRNhz3C!X`hFOc|4i8o+6y%R4(#L$4vfw^mdoZe%}BNSJS-UrL;=U6dYfq^ufUZ8 zESo6Qi26XPyG>x*=vN(8q#;xR!PN0MaZ({hU2E3H@SUZs9AZIDv#yTRaq3~vjY-R(=RO*+!Oe)oOnqm> zQczymJH0SzRz(i(#;fCdpv3{CeZ7$u6*KH>l)~5AQ{KxkVU@95<%JGBb7~@;)y@}M zw5gj?f0eSC%XGUvi|7ie>Cv<8p)rt6BH;EYHRlv`unSIGW-TU4+3gn!fG73?$ z9r}BYTW+xqYB%T(2>^o}jjRra5Yt3Y8naEgW$ttr&NgDOpE`vBUhp+kn#jluZ1p4g z92vM4{pe<8&^2NS5Rujtt9oh?oYBGppSJ@)_~ra7*`Y*@uDhhi#LH5-Y86LAh{0ZJ`OInw{^lKOWklbR^#Ye8wCZ%Gw5#v| zX?V1^nTe}%lWR`^@tLAmVUl4qjZ=2F7^Ga9wB7C60H?t#vO_oR(F_t40-Gm;KPQox zdZue$!)g3w!jC@&gXK8b5_)HUzEG1U;qprCk1=@d&5AdPD==jNn)W*^DF{V{TYx-T z!QT6gq64U>4E%mYc&|1i>~e7`Z7x4Kd{k{C@8%m`63naOr#AM$kT@S3RHmZKcAg1w9FTAp|GLVR{lwx|>}sRsTq z6>GHZoLZh=#J&tp{#=;(IanbCqv^?T7o$12`Wf6);aD6}3<*uQPN9z?d zOU3pL)qV;$sJMf;04W|;j)^0e*<@SZh=tyQ>DWRYnG+#q2oXmkde=;E9rQJ4X{*bS zXUE#k$v@NiYAq{3YLUaawI6rqIb)2IsT+%iI;{*!#{8)SA4?A`)b;omwp{l(tJv6a zowAs%fZALg{hrX30x692QUJ;TQ^NuNyk<>}$!46y2~O{Ij{|$RlG0ZkxxH<0R%&J# zP{SJbjt%-RB4Aatab-tqKZ_a$1rXe6YL*d`RF4o;aES=IA~Uzrln9qph1`a^1Cb(B zOZ&lw5*$;YYhyu1I|5H%U^m&@ILeC}W@ooS=naa*3+{6&9aqBN#F*a7&}OEOUvK*` z>}CtLQ(rveVA5ha3U;y;DjQy1B>LcJk%&1V_(Lh@mQ-i+FlZF3t~^+!W%{kMzg_7+ z{n&4Znxf_QP8=M)aYC!I-=$6fjhAVSSFAia0Cwxm4R=bXPS&1HBqm9)>v6bneF4JB`iZNrG9pi3I)G@x+L3J|h{THsU9KCtcFeuAU7vlc zyb?1VjPrLU6agPES70+BhxMgc%YII-KF+o<2YUlwn2V>=FWZqY)K1(X4#uBt*3pA~ z)TX3#ku%&R+6DH}S=wqjr9DKp8c}%7ZSh%eQya`cD?PKWalaAIGkTVsGmhVMM~0Pr z>4U7v|;dBL?+?Is9_n`X994iBq)f(57rW{Q5$k zhNl?s6`vqSEn(-tC-gXlT*kWe`pj2uEwg6?-0c2N-PO%jPSvet;j9cou8+B>y7&QI zuA+w71(_4_ElLmk4DXsv5W{={j~u&0dO!ZdjLYLiu$EUN`MQxN+w_{K47rJP84yiDKoBHtm^H1QQ z<+wx_bnSF#FpkfO8gvfIX92n{A9?>2x<&eTMcQ?n`c|-&2*Or{;-3|<_4Yn5NN%#u zdB=5a9MQDz=E#t6q|Y^!rRNf(IhdP4L&j$0XYY(Pey)G>7~X4n^7QF}dXRhb9Rmah z@4BT9TWJo>Y@rE?6}~MkpXsL0O`}cpm34O>=b`%{E?W95BTfDR zZE;`Shr#&*+AIfoeTbG>2j;_6xDsmIPt{h@o7j-CLSb9=Gh(YvcVS^DWI3cEvj3|) zfH_`aHXhU0ULYP(_jdI&l6a3Md_>GCBRJwrl!r zmh}P7!aP(>3GaLhSs9)HsIfRN!nYGHpS+|1Oxz3WAkFJ;OT2^c+cCx z$MQTit%hCsd8pn#zfkw9#0@0$%| zrjEZf`W3Hf#Lnmv;`UM&_cIc%!V1`SO@pRbue)p?ojJ*kcB^$C{TQ1{hMilaad1~* z=d$!}0XFYdLC1Tq=V_~UwhqE+RS1STNK+ziS2Ob;mJs!><0l0)5%Tr^s1Vawh%`s{ zgcAr?-*L#5AxPK&R@BPCGDBqwV%_1vBIS@HtM-)@HHDHa;imyyRwBMiC>%Ss&8A$M zv zE!J5&s`7YCUXDpUy>yRcv&@ng_sOT7yv^6l`(Id2U3QU+WSvm&10+G-TaJJ{^3I{$ z*KGIk3EHl1Wy|FoJW`$wUt>S9qAWf`FwL3MUqZSjNlu2Qj|J#z0&!^0`R~6RSzycY zkKaEIFfIZwZt?%1h_I!-u;IT5zPR`M0~W#h*mGw7&ra|h%7*)U!2jPPGyFngKM1IK zpn(6M@BWKp2K{!-7u@p2fnAi#KLbl6{uTYdi08#P7scjJoLsE8|2OHm7~vwF|A`O+ z{ugq;2*1cZf8gX~|A7B56J11K?9e}GKJ0PMzx1oFCJ38-0DuU4nP3Iz^*P@G{s-D? 
BQXv2U literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index b4442ceec8df5..fa67ce95488ad 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -132,8 +132,12 @@ def test_to_excel_with_openpyxl_engine(ext): (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), ], ) -def test_read_with_missing_dimension(datapath, ext, header, expected_data): - path = datapath("io", "data", "excel", f"no_dimension{ext}") +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +def test_read_with_missing_dimension(datapath, ext, header, expected_data, filename): + # GH 38956, 39001, 39181 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") result = pd.read_excel(path, header=header) expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) From d6c3af1ea172f723d72d9eedc6c264ab1f704c08 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 31 Jan 2021 10:14:09 -0500 Subject: [PATCH 04/18] Return "" for null date columns, trim empty trailing rows --- pandas/io/excel/_openpyxl.py | 19 ++++++++++++------ .../tests/io/data/excel/dimension_large.xlsx | Bin 4894 -> 4920 bytes pandas/tests/io/excel/test_openpyxl.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 3f7d6382314bb..6f29f31441c73 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -503,14 +503,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC - if cell.is_date: + if cell.value is None: + return "" # compat with xlrd + elif cell.is_date: return cell.value elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_BOOL: return bool(cell.value) - elif cell.value is None: - return "" # compat with xlrd elif cell.data_type == TYPE_NUMERIC: # GH5394 if convert_float: @@ -529,11 +529,18 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] - for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) + last_row_with_data = -1 + for row_number, row in enumerate(sheet.rows): + converted_row = [self._convert_cell(cell, convert_float) for cell in row] + if any(cell != "" for cell in converted_row): + last_row_with_data = row_number + data.append(converted_row) - # With dimension reset, openpyxl no longer pads rows if len(data) > 0: + # Trim trailing rows that have no data + data = data[: last_row_with_data + 1] + + # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: data = [ diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx index 6a8daf607f299df20579bfc31ff97bea919ae1bd..3aa085289eeb830a48653c4b9a650b2a345be065 100644 GIT binary patch delta 1160 zcmV;31b6$MCb%ZBN(K`tQ9n`z^K)4n1ONcw3jhEZ034H#5gQ2!00SP#7GNI8lYs^z ze^ZjmE)u8SE|A7SoVLKe(IRCLp-7da>?U8oL+TpID1u^<07=A|;o&(mlxCt&zcS_= zh*VVYg&T}~*C9L>nDYI?{r0f_aOOHnBaRUhoGjcYqTEl*chOPEN0kzy9RT2J;ig*W z7oMl$lw?SaL{2z(5+O6BVJY{X$|XVAe{RZ{H}?Ixmm$jC65&F=216tXjmfo$i;Qqx zf=I%U2Kp+cxvJ5A;aA~diH;z(=F_2gU3wb4U^>9dXe@;ii5|rwtEdfCo_psnbY}5u 
zh-FAV7WoGN&OzLcGOC{}M|T-n@H^=wlxZTr3Ji5g5K{zvIogvg(cg0G*ALYET7@tu;R>O4Ct6~{FYEErNXo;Ch0DZZhcfK|() zGTw{hO0X}G3AkEa$G-$pRvRSUr$F(BB)WmrXtyPd#G2slzD=PS9I}1Nc7inkm?WrR z+E9SwgsklcV7zd-$tMFyk((n{e}plk({*AK`w!sHr>^rXM7E6)gDeHU-&x<3RVE+{KPc8I@Crt=nIE17<5`m@sLB^&at`kC2;S^xs77t6fATNNMBk7D zVGwT#b!~7$sn2^spyG3lG+IVd937~I$#kK#$jYGR2y@zaJo159N--w9@s#1eVPxq* z`54NTxghnmkIi|ew8|a}n-y3OOF!}sKsAP6Ak{FN{DxUDF`4;)iA^lAafevY5VH|i z!-(vqe@WAL=QGLKD6)FG7=b<1I+S0%JPpZ`}h6NLgz5{`Yov#=Fb;{ZGqx@j<9Zoe^R@0N{GzXU53$Akaxha-;`B z5)PhJs2mwss$HO^A_%*GmvR;i!*Ckph;qL~I9IR15NS#iawU=?C)|`EiZEn=zRsxB zHQF!yDm<*v0aC5`v?yMco(3-(_waI>D4|7a28qZkYW*zFg0mNS=E-Y__w$IH+D;#+ZVH zw(OoF>B7H=&Ogs>gbVmi$w9ZCJ)MceHSjAK(lwkj{;ep!p`3u#sG>67i^EE=FCYY* zthVD{0*ciJMRyr6ydkM+APw4X2qTFhxc%O`Fbp=?Jmp)#8URdER4`*Hz;;5__B}9O z_}t3L08&W1#fmV0X4CY&#K!&u_|viPJqwX<62w5HC=6TcTlad`YUR8^Poi+LR9^cr zY=wBVCW{-|BDpNAyb{?Dr()rI2&{Wjvbl`f>+)4k|8xZbUsEp&+g;l!HM&xBsvxVJlI2?pPETb3`-gwIJ-!O7?pnMETZ4XF&>Jz)4 zEvm(PnZU(6+H*4v-wU Date: Sun, 31 Jan 2021 10:30:40 -0500 Subject: [PATCH 05/18] whatsnew --- doc/source/whatsnew/v1.2.2.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 50abb0e872331..ab952485d29fe 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -26,7 +26,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) - .. --------------------------------------------------------------------------- From 8cd7aade9a3410656227b01b3af4458e65820b12 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 1 Feb 2021 17:30:58 -0500 Subject: [PATCH 06/18] Removed fix for 39181 --- doc/source/whatsnew/v1.2.2.rst | 1 - pandas/io/excel/_openpyxl.py | 6 ------ .../tests/io/data/excel/dimension_large.xlsx | Bin 4920 -> 4895 bytes pandas/tests/io/excel/test_openpyxl.py | 2 +- 4 files changed, 1 insertion(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index ab952485d29fe..ab2838cd7918e 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -27,7 +27,6 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) -- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6f29f31441c73..1b706e062a3b2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -529,17 +529,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] - last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] - if any(cell != "" for cell in converted_row): - last_row_with_data = row_number data.append(converted_row) if len(data) > 0: - # Trim trailing rows that have no data - data = data[: last_row_with_data + 1] - # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx index 3aa085289eeb830a48653c4b9a650b2a345be065..d57abdf2fbbaea74548d94ff72dcfc1468fa24a7 100644 GIT binary patch delta 1170 zcmV;D1a14cCZ8s-N(K$Ei$PK;MdH-k1=2W((-zn_TBIx@6v>j5-Q??cNL{Q* zMiCT?1V|#zj1JG7p*RzN`jsJte zAV-w@CBlVz4TeZlnviRe6glCh1W|+`1N3!9rLNI_;aA~dg$|Hv&8J22y7V-7(YS|~ z(?kg^QZq6*B(!Ds6iMg)Wpwd*9NG{U@tu-`ZasTC z6NeQLEEv)?o-_WfD88YbfYqp?GTw{BQm`+e1RSll<6i=j)dod(8Bn|-sc9e$+HMFV ze~BTu{ocAT3^v(3mec0$$mJusg8-0H~yQb@bSk}ziT^u5H!{sZ{a zvF|+#k#7>jK&2=QTkBi*de>_8yg^T*aI#ch`!#HZc(f*q8`>(lEUdl~*)OMJ;d=tGB=`Tv5Z^)i7htHxv5_OjPbODJj9U@oPICQ- z+3DmaZ=Bqumt3bVSCby~Nhi5}#M$ZOrf;0ww3pndliajNecDN`A93DEE@<5hQc-T! z(zRr%`YgCXYzZ>}QOCUOAY*9KcLZf){EF0$!p6)s zoNzEZn@y@NbX>wc?FOOAaMt{Q@>^j*>QkP~b-5!zDuijRgQh7L5?X;IYD1riyQXxj z?W9y`W1Guuk#9+Pd%6Z*-{;k`xxjtT!mgFu8;b*f$OvD90&oHp1p(Ype>C@{P(~pN zjc3bH@_5PdkBpjTCh%YcYPH`4+R#$ucDmBGgL1GHnyk9)x}~_56i$*&q^`lyk_btO zyO=?XQmvKK$ip;Ep-$X5{3E5>yvF$R{&?OOvB0<-Di-fiEMJHU4lr$kOulHrMarne&JW)VTq0)wdT{I zcwKrLykI)O%V;cx5{VwgBCDtkRi1n2FLY+{Ylvk?J{EuZ2LR4N+>SD;pDagr8Cmc< z=_HhCBEJd@bx9CY1bjK#lP%HTa_iR*;tmY8+gnCnwRss~3Mn*YcO*#`?qzWCdFq=G z8}Xfzqv||6Diy~y5G)u{HJ&y8Eh)aCoPbr!qB7o#<4UkEkO{b2UB|x!QdS!z-KRkD zh9tUy)M$UVC5*(H;O@Rnp&1;qead!%H2|0-s9@Sqfa8R$?FV4IaJk7R14xmZBUXel zqtkU_6Z;R~&!?{QEJU`A5rZrRzTa8j+U$Av5`|4m}RM(ZK?7v%0%Cg17Q$v33Y97LaEPtL7?JujWk+DQXCzq zg~@cGw8+Y!<_L4zcs%lfSV}P_yz!LbzhPwQK=~NTmAN4GwU5ntrnJf)3!4>K4@*Dt z4nTi3hF>7nFq`~_Suin~`G1K`EU|HiSkMr&5m&>A@ym$SFk*6A-1LpbO)asBCB{#R zD4}>UsLy-ih7sqd z#Rc>F+^eA66q@f_iPE*a+!eOF!ojYvzsrB?e1&pJx!&YtuNQBUxwbu*)MGkEzCvr=liTxz1PK!&ZRN6GM+alYM@=I-pg}Q%t zRLiDWxQ>C{NP9LGNB)o!zJUzDZIcudUzu-^HA3ndw3b-pl-MQ!xhS)>wmEVzO%kXR*A~B} zRGT*#e?GMD)-n^Axu@O1~tqPN{3LKLO5-I_=lQa@R0hg0@5<{9 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index a9ad4229cf949..fed8e2923382b 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -136,7 +136,7 @@ def test_to_excel_with_openpyxl_engine(ext): "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): - # GH 38956, 39001, 39181 - no/incorrect dimension information + # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}") result = pd.read_excel(path, header=header) expected = DataFrame(expected_data) From b70b65d789162c771442f95a75a61da38ca004c9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 1 Feb 2021 17:42:38 -0500 Subject: 
[PATCH 07/18] BUG: read_excel with openpyxl produces trailing rows of nan --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_openpyxl.py | 6 ++++++ pandas/tests/io/excel/test_openpyxl.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 095d44bb84590..f2c7f139021a8 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -31,7 +31,7 @@ Bug fixes - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) - Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) -- +- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index fc567d5028777..1de4bf5730a56 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -531,11 +531,17 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] + last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] + if not all(cell == "" for cell in converted_row): + last_row_with_data = row_number data.append(converted_row) if len(data) > 0: + # Trim trailing empty rows + data = data[: last_row_with_data + 1] + # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index fed8e2923382b..98e45a7f18f96 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -141,3 +141,17 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename) result = pd.read_excel(path, header=header) expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) + + +def test_read_with_empty_trailing_rows(datapath, ext): + # GH 39181 + path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") + result = pd.read_excel(path) + expected = DataFrame( + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + } + ) + tm.assert_frame_equal(result, expected) From ea150b39601b243bb248ee74820ac5f576990189 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 1 Feb 2021 18:11:17 -0500 Subject: [PATCH 08/18] Add test excel file --- .../tests/io/data/excel/empty_trailing_rows.xlsx | Bin 0 -> 4900 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/data/excel/empty_trailing_rows.xlsx diff --git a/pandas/tests/io/data/excel/empty_trailing_rows.xlsx b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..920b03915a3c8f1800bc41e76dfdb75e783762d1 GIT binary patch literal 4900 zcmZ`-1yodByB=cbE`cGWOIiVG1c^bqL2~FAaA<~ZkrIP&Xhd3+PH6!dq*X>>NRg0~ z5P>`By}zH_@1L{goVCtc&wKWM-~B$%v$w7$4lWGuFW&Yue&Ro~eg~XAGU$N@j{M_=dov2+T{L z!3Q%mosYf6~JXbIA_ZL2Yg(NRDA+Fef==uxz5oegA42 
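
Patch 07 above restores the trailing-row trimming removed in patch 06: with the dimension tag ignored, openpyxl can emit extra rows whose cells all convert to "", which would otherwise surface as trailing rows of NaN (GH 39181). A minimal sketch of that bookkeeping, with an illustrative helper name rather than anything taken from the patch:

    def drop_trailing_empty_rows(rows):
        # remember the last row that still holds data, then cut after it,
        # mirroring last_row_with_data in get_sheet_data above
        last_row_with_data = -1
        for row_number, row in enumerate(rows):
            if not all(cell == "" for cell in row):
                last_row_with_data = row_number
        return rows[: last_row_with_data + 1]

    # drop_trailing_empty_rows([["A", "B"], [1, 2], ["", ""], ["", ""]])
    # -> [["A", "B"], [1, 2]]
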
z6zerekP%1)JwgO>y=!_V8#AOKY5S`_+T^(t}y&*^?yWxWnl5>e6-9IR7Op zY}K2i3nm4rf``1S!s4xzI`35sjPe+@Tik{HkKx(`yi*ph(DC=O8x3m`Gv4^hHpRiR z6{x>NSDks^Y$Q-kIcrVR`ufg!vSXCTSplH?S0Ww+h;JTZ0RT`O0Dv5mhyYAXPnesX zr>C1xfQNgM!2lE`LeX|8fII6UXypoeier+T3YejIF_4+2O6Qh9)4%hy;CzD~<#L^?zCJ-Z1o-CSu%$mSDrZDzL! zDL)>xH41lt7_&(b&ou-BZ!{>+gcqonY9SxqsoC>@gkP!0?lAwMM{(ygi?~-hI_8*_ z#PTV7T+%T44c-k!67}SsS>u9N?seB%DZZfpoc2?FRM1l9X7(4Se4>y9hRW65l~C9RCq0|n#E4|a#aVxRI}_U> z$9Y*hmfekfX2T(FEIEOgSE~t9G}_rr5o>jHH2LG2PHHpi%|lCS#c8@Z@Z`iZ*AQ z5=DKrS6>RbSi`65gmZ6KU7BpV*NIqYmCz zkE}6tnwFSOSXu?sT4bn(+FU`t=MvW7W{ffX8)v=WJQ|+a`qnh`}$>jC-m=Ki=zy?!(_5cDkHF?|&9(FES^7-%V zp$c(|E!)`Q5C5~>~=fRaUxE;-nxgQvEc&kMch))6w#|fDy~JreN37v`B#8e z$`rs%v_;NMp9o&vLo}V9@&f7`ljz3Umo%wZ646@%SC~+W`d3SbR+%ca7fKGU?vsiw9ts%fGhi3 zHdCk*4g%Hon@MbAOS-B_L#X_NsS~ji#69A<28b{++du&#>Di+@B{~--Z)+(ogTRYQe)7vjN-P=bs`SiASzu$DgR_QC`TI3^W8PWMsLyyzM`jT#qU%fN@N)MnFHiR`>|gsCbB29}(g$}WyS zzM62iG8<$?A&&Kg_vN@Dd3T{sXAN=7apH61vFc!4b&0*)i~|MaoYr9;i|=5@4u#@c zW-j9+Yut!rgQt`;Q3#EF&O9+YuhRO6VOov}k3p3Y^6q=7ltQOe^S9dYaMGgRZZIye zL&hFoO(3*OZPyO0a;vr8HwrL(C|7L0=c3mtBu8p>eFN+D896!Cdm)9{T6) z9J8N{b_{wcx6iO5yzfA}fs#7Og$lXS+cL{scj=4Uv3r|2Rh(nn5$>O|GCiC2p5Zqt zD{qpmjOR_u=ISR?O}~aGaEYIbH6O{6c*H+A8vbO}kRzAf6?z|+Sji{pOJbOSwr8Wg z^Nq5S#KUUq@S{jDdeT+q%z@X;LpS&tdoSo|2HsdIGK-K7q>4H*j-d?b|G|d}cLwlk zsE`&N>r$O zUC?YQ4=>`{lMn5rpl~&}*Nyp=Y45t$o>8By7+wob$@6^+mN3zSTSC9@>zGjP-vHKh zY<_9JuIcYkJ@tgDq>EfzE1|Vu)|p0oxZtd0l(2j^#tb<+vo-(hG;y*-@X47jQAoGn z9h+8;iTZg$htN)9Y}#BmyZ+!Zn=M+T=z z+$BtNgDb7Mv7+|eUFu}$B;9Jit;}El$TmqI_Rz4~eH-LB+QGkfP6d*(JEeP7!nVR* z)l7MsPWb@ZgZ6=SOKL+`<4KuB1I3!e>^>&Yvv7S#%0OsC_xlEtqz9NMmt80F<)y(@ zpnHDOO*fHMrflPB0kz^GS)|+o%{fA6rkD1cTAcR&~!A8n0R} zZ0u(qH%NrHi%(~*c^&I2mPn=V2no*ZWIlPu2-8!XJJ?z!Mha^h1B`Uox6OG5V`RZB+Lhq|8si8oB&|WE1x_tgK)1!a>%1EBft1X;5S}a2 zPfCb{u{;zX)lacQ3kZwh?|U3C%PlLgGk@e5d>NIiFX2x(Y3<0e6-4Ywh^o{L-T2A5 zx7(0;yYl7iOfdG}nNR|JxL$?HfDFc+VvPF*#d3t_rmk0?S zxs@MtImk2xZ$sPkj+#U`M38MmJp*%4>=SC3{rBP;x3#+W*|gn;eq+qFBhi2Zmf zqjJB^%5V~_#!6uLNR^6MQ3DJdldU*>&JYf#Mytn>4?QRi@);-#X zTlW`>WofudaZ7mk+3E;-kRMSK6f#dYB{t_4Ikn95@Htuiocd~7tek4vDk7Qb1zewS zQ1$WxdR>JKGYiuvWm}aad=2lJPZGg=01q5{jrx2BU2)zQPuU;C*b2fy@ur*#Nsj{4OK_Qn@;q%$7GI2DCGLY(V zO@<0?ZY^12?G<64p-g=u{}C@e>D0m?&t6{lEN<5{?X?r0BafZh4OP+@*W3gw{?`x= zK5<3n?G8{;NK-_h77L4sx8a=yLP7`?_f7(5sltcxLz466ImUx~9tlt7xa3!CGAXBz znGcr8JHF5wEl()~VDSf=`K3|1nwaUW(}jL@!}@uKmJ_m3R@;p~4J&d1L*J`1k~9zE z`4ybPfnO7nT~Kv1p~2YRr|OUe2+v)hoNRR18Dxj_?}~KlHV>|0DiMUK3dKJwV(aB~ zQIPCZor{j^-8!M^+|QCCW_!HQSn)WU2*n0!0lqgjCqM5t2LIUn;xTg2`Yb2sNF&I- z#nb@7#=U8&!&06_gDx_eW`^%d$fmpLbJA!NE;8>gVc&N@#(^ivE2@x|_T#1p!ETX& zz>j1jXiEp{-wiJn(q=fw>MLlOcg?>06sbHt;j3n=zT1L_%NTL5g}iE z{J`t2_H-RT2~-KUS=8RxD&o#+24@~IM^P1Qx3*DJc)(qHkj{cwzSC;cTR+a`CH>xQ z(ge8cHRp=QeS9ol>->&^ua;=5_jZmVX;tw@*hn8n-Khcb9+wdgY~rQ(Hxuv-e6JD) zFGpLT`XlkL)tEYDOW`MNVO7;h!O}ye3gg}3!Ge_vCsv(nYwC(+8P{_BIjjVIl&7)n zShkz9X;O!ph!5}D;=q$N=$WN-(#MKzJ^KW$6V1lX+_rSZ9r%bTicR^Vbv>EA=H0^c z7^kZY@0IGTo>Y5ukXO#8om~wi-mb9Z#(DN}FL(PD=y0Ct%w@lQg{%kSb<~cZ+rj3a zOWyN7`xQ$V9{wB2olKbmg9j?;$PLz0E6UPy1ml9mm8jxjC+FK_h!pa?Ogy|nGW48FVr`~w!mxY-M4{?C5!0?LB(d%*wSDbxQ# zVm=6{jZnn<&v*aDHv@nB=1ZRW^1v?3<)49N68(z)U&QlroXcYKCr%;8;s2ZTT#j&= z&i_P+BKa3`zYM?3J%8Y|WdDHwFB4rxU+&O9XhF=O&cF1lt|kzZeE@(EbD3ZSsNsU| G0RICPPjXKH literal 0 HcmV?d00001 From b8871839ccfc189eb887d40fa24bf874a14235b2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 18:30:13 -0500 Subject: [PATCH 09/18] REG: read_excel with engine specified raises on non-path/non-buffer --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_base.py | 22 +++++++++++++++------- 
pandas/tests/io/excel/test_openpyxl.py | 8 ++++++++ pandas/tests/io/excel/test_readers.py | 9 ++++++++- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 0ee1abaa2a0eb..dc27ceba138c9 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -21,7 +21,7 @@ Fixed regressions - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) -- +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 213be7c05b370..1d77cc12caf8e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1069,26 +1069,34 @@ def __init__( xlrd_version = LooseVersion(get_version(xlrd)) - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: + ext = None + if engine is None: + # Only determine ext if it is needed ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) - if engine is None: # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": + if engine == "xlrd" and xlrd_version is not None: + if ext is None: + # Need ext to determine ext in order to raise/warn + if isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + path_or_buffer, storage_options=storage_options + ) + + if ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." 
) - else: + elif ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..7d24de2a17529 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -116,3 +116,11 @@ def test_to_excel_with_openpyxl_engine(ext): ).highlight_max() styled.to_excel(filename, engine="openpyxl") + + +def test_read_workbook(datapath, ext): + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename) + result = pd.read_excel(wb, engine="openpyxl") + expected = pd.read_excel(filename) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b2e87de5580e6..a594718bd62d9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,6 +2,7 @@ from functools import partial import os from urllib.error import URLError +from zipfile import BadZipFile import numpy as np import pytest @@ -685,7 +686,13 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(ValueError, match="File is not a recognized excel file"): + if engine is None or engine == "xlrd": + error = ValueError + msg = "File is not a recognized excel file" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): pd.read_excel(bad_stream) @tm.network From 601ad87cf02d2ddc68a026fd01ed87ceb24505c3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 18:37:51 -0500 Subject: [PATCH 10/18] Restore special-casing for xlrd.Book even when engine is None --- pandas/io/excel/_base.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1d77cc12caf8e..84b5cae09acce 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1072,9 +1072,12 @@ def __init__( ext = None if engine is None: # Only determine ext if it is needed - ext = inspect_excel_format( - content_or_path=path_or_buffer, storage_options=storage_options - ) + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content_or_path=path_or_buffer, storage_options=storage_options + ) # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) From d835dffdc779d90f6857432c3cd6f36ba4bf4b90 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 18:44:20 -0500 Subject: [PATCH 11/18] GH # in test --- pandas/tests/io/excel/test_openpyxl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 7d24de2a17529..bc2aca862546b 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -119,6 +119,7 @@ def test_to_excel_with_openpyxl_engine(ext): def test_read_workbook(datapath, ext): + # GH 39528 filename = datapath("io", "data", "excel", "test1" + ext) wb = openpyxl.load_workbook(filename) result = pd.read_excel(wb, engine="openpyxl") From e6684e9a0f0f3d953ac05de3f7d68ddc327902d4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 4 Feb 2021 18:19:24 -0500 Subject: [PATCH 12/18] Added wb.close() to test --- pandas/tests/io/excel/test_openpyxl.py | 1 + 1 file changed, 1 
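
Patch 09 above addresses a companion regression (GH 39528): when an engine is passed explicitly, read_excel should not try to sniff the format of io, because io may be neither a path nor a buffer, for example an already-loaded workbook. The new test_read_workbook test exercises exactly that case; roughly, assuming a local test1.xlsx exists:

    import openpyxl
    import pandas as pd

    wb = openpyxl.load_workbook("test1.xlsx")
    result = pd.read_excel(wb, engine="openpyxl")  # format of wb is no longer sniffed
    wb.close()
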
insertion(+) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index bc2aca862546b..d426811c43efa 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -123,5 +123,6 @@ def test_read_workbook(datapath, ext): filename = datapath("io", "data", "excel", "test1" + ext) wb = openpyxl.load_workbook(filename) result = pd.read_excel(wb, engine="openpyxl") + wb.close() expected = pd.read_excel(filename) tm.assert_frame_equal(result, expected) From ba2bc75a36d594ffec889e819e120a8e2aa70ea2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 16:44:55 -0500 Subject: [PATCH 13/18] Added logic/tests for determining if a sheet is read-only --- pandas/io/excel/_openpyxl.py | 6 ++-- pandas/tests/io/excel/test_openpyxl.py | 39 ++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 64c64b5009b0c..6bc742f6d9577 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -533,7 +533,9 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: version = LooseVersion(get_version(openpyxl)) - if version >= "3.0.0": + is_readonly = hasattr(sheet, "reset_dimensions") + + if version >= "3.0.0" and is_readonly: sheet.reset_dimensions() data: List[List[Scalar]] = [] @@ -541,7 +543,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: converted_row = [self._convert_cell(cell, convert_float) for cell in row] data.append(converted_row) - if version >= "3.0.0" and len(data) > 0: + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 830da757d23e8..3bc502e8b33c6 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -151,10 +151,45 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename) tm.assert_frame_equal(result, expected) -def test_read_workbook(datapath, ext): +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +@pytest.mark.parametrize("read_only", [True, False]) +@pytest.mark.xfail( + LooseVersion(get_version(openpyxl)) < "3.0.0", + reason="openpyxl read-only sheet is incorrect when dimension data is wrong", +) +def test_read_wb_with_bad_dimension( + datapath, ext, filename, header, expected_data, read_only +): + # GH 38956, 39001 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): # GH 39528 filename = datapath("io", "data", "excel", "test1" + ext) - wb = openpyxl.load_workbook(filename) + wb = openpyxl.load_workbook(filename, read_only=read_only) 
result = pd.read_excel(wb, engine="openpyxl") wb.close() expected = pd.read_excel(filename) From 1381eccfa8e8ca19fde8dd0c483c33ba7a43fd32 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 16:51:19 -0500 Subject: [PATCH 14/18] Added comment --- pandas/io/excel/_openpyxl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6bc742f6d9577..274b18b2605cc 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -533,6 +533,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: version = LooseVersion(get_version(openpyxl)) + # There is no good way of determining if a sheet is read-only + # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 is_readonly = hasattr(sheet, "reset_dimensions") if version >= "3.0.0" and is_readonly: From a3db3eb54b4439edbbd4ac49880649116d3e9288 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 17:20:43 -0500 Subject: [PATCH 15/18] Combine and reorg tests --- pandas/tests/io/excel/test_openpyxl.py | 61 ++++++++------------------ 1 file changed, 18 insertions(+), 43 deletions(-) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3bc502e8b33c6..2f901bde0ba09 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -122,32 +122,14 @@ def test_to_excel_with_openpyxl_engine(ext): styled.to_excel(filename, engine="openpyxl") -@pytest.mark.parametrize( - "header, expected_data", - [ - ( - 0, - { - "Title": [np.nan, "A", 1, 2, 3], - "Unnamed: 1": [np.nan, "B", 4, 5, 6], - "Unnamed: 2": [np.nan, "C", 7, 8, 9], - }, - ), - (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), - ], -) -@pytest.mark.parametrize( - "filename", ["dimension_missing", "dimension_small", "dimension_large"] -) -@pytest.mark.xfail( - LooseVersion(get_version(openpyxl)) < "3.0.0", - reason="openpyxl read-only sheet is incorrect when dimension data is wrong", -) -def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): - # GH 38956, 39001 - no/incorrect dimension information - path = datapath("io", "data", "excel", f"{filename}{ext}") - result = pd.read_excel(path, header=header) - expected = DataFrame(expected_data) +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): + # GH 39528 + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = pd.read_excel(filename) tm.assert_frame_equal(result, expected) @@ -168,29 +150,22 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename) @pytest.mark.parametrize( "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) -@pytest.mark.parametrize("read_only", [True, False]) +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) @pytest.mark.xfail( LooseVersion(get_version(openpyxl)) < "3.0.0", reason="openpyxl read-only sheet is incorrect when dimension data is wrong", ) -def test_read_wb_with_bad_dimension( - datapath, ext, filename, header, expected_data, read_only +def test_read_with_bad_dimension( + datapath, ext, header, expected_data, filename, read_only ): # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}") - wb = 
openpyxl.load_workbook(path, read_only=read_only) - result = pd.read_excel(wb, engine="openpyxl", header=header) - wb.close() + if read_only is None: + result = pd.read_excel(path, header=header) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("read_only", [True, False]) -def test_read_workbook(datapath, ext, read_only): - # GH 39528 - filename = datapath("io", "data", "excel", "test1" + ext) - wb = openpyxl.load_workbook(filename, read_only=read_only) - result = pd.read_excel(wb, engine="openpyxl") - wb.close() - expected = pd.read_excel(filename) - tm.assert_frame_equal(result, expected) From becd2cffb7e8b1e9c8508e8613128e4ce3ba1ff0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 17:23:46 -0500 Subject: [PATCH 16/18] - --- .pre-commit-config.yaml | 11 +- asv_bench/benchmarks/io/csv.py | 4 +- asv_bench/benchmarks/rolling.py | 10 + asv_bench/benchmarks/series_methods.py | 4 +- doc/source/user_guide/style.ipynb | 11 +- doc/source/whatsnew/v1.2.2.rst | 6 +- doc/source/whatsnew/v1.3.0.rst | 23 +- pandas/__init__.py | 2 +- pandas/_libs/hashtable.pyx | 3 +- pandas/_libs/hashtable_class_helper.pxi.in | 45 +- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/index_class_helper.pxi.in | 7 + pandas/_libs/khash.pxd | 3 + pandas/_libs/src/klib/khash_python.h | 10 + pandas/_libs/tslibs/__init__.py | 34 +- pandas/_libs/tslibs/fields.pyx | 151 ++ pandas/_libs/tslibs/timedeltas.pyx | 19 +- pandas/_libs/tslibs/timestamps.pyx | 158 +- pandas/_testing/asserters.py | 13 +- pandas/compat/_optional.py | 2 +- pandas/conftest.py | 36 +- pandas/core/aggregation.py | 218 +- pandas/core/algorithms.py | 17 +- pandas/core/apply.py | 218 +- pandas/core/array_algos/replace.py | 23 +- pandas/core/arrays/_mixins.py | 4 +- pandas/core/arrays/boolean.py | 3 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 10 +- pandas/core/arrays/floating.py | 3 +- pandas/core/arrays/integer.py | 5 +- pandas/core/arrays/interval.py | 12 +- pandas/core/arrays/numeric.py | 3 +- pandas/core/arrays/sparse/array.py | 6 +- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/dtypes/cast.py | 7 +- pandas/core/dtypes/concat.py | 50 +- pandas/core/dtypes/missing.py | 6 +- pandas/core/frame.py | 10 +- pandas/core/groupby/generic.py | 12 +- pandas/core/indexes/base.py | 70 +- pandas/core/indexes/interval.py | 32 +- pandas/core/indexes/multi.py | 10 +- pandas/core/indexes/numeric.py | 9 - pandas/core/indexing.py | 8 +- pandas/core/internals/blocks.py | 153 +- pandas/core/internals/concat.py | 93 +- pandas/core/internals/managers.py | 27 +- pandas/core/nanops.py | 2 - pandas/core/reshape/reshape.py | 22 +- pandas/core/series.py | 18 +- pandas/core/sorting.py | 2 +- pandas/core/strings/__init__.py | 4 +- pandas/core/window/common.py | 16 - pandas/core/window/doc.py | 119 ++ pandas/core/window/ewm.py | 311 +-- pandas/core/window/expanding.py | 521 ++++- pandas/core/window/rolling.py | 1775 ++++++++--------- pandas/io/excel/_base.py | 31 +- pandas/io/excel/_openpyxl.py | 24 +- pandas/io/formats/style.py | 77 +- pandas/io/parsers/base_parser.py | 37 +- pandas/io/parsers/c_parser_wrapper.py | 8 +- pandas/io/parsers/python_parser.py | 70 +- pandas/io/parsers/readers.py | 39 +- pandas/io/stata.py | 22 +- pandas/tests/api/test_types.py | 3 +- 
pandas/tests/arithmetic/test_period.py | 3 +- pandas/tests/arithmetic/test_timedelta64.py | 78 +- .../tests/arrays/categorical/test_warnings.py | 13 +- pandas/tests/arrays/sparse/test_dtype.py | 4 +- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/dtypes/test_concat.py | 75 +- pandas/tests/dtypes/test_missing.py | 10 +- pandas/tests/extension/arrow/test_bool.py | 5 +- pandas/tests/extension/arrow/test_string.py | 2 +- .../tests/extension/arrow/test_timestamp.py | 2 +- pandas/tests/extension/base/__init__.py | 28 +- pandas/tests/extension/base/casting.py | 3 +- pandas/tests/extension/base/constructors.py | 3 +- pandas/tests/extension/base/dtype.py | 3 +- pandas/tests/extension/base/getitem.py | 3 +- pandas/tests/extension/base/groupby.py | 3 +- pandas/tests/extension/base/interface.py | 3 +- pandas/tests/extension/base/io.py | 3 +- pandas/tests/extension/base/methods.py | 12 +- pandas/tests/extension/base/missing.py | 3 +- pandas/tests/extension/base/ops.py | 3 +- pandas/tests/extension/base/printing.py | 3 +- pandas/tests/extension/base/reduce.py | 3 +- pandas/tests/extension/base/reshaping.py | 3 +- pandas/tests/extension/base/setitem.py | 3 +- pandas/tests/extension/decimal/__init__.py | 7 +- pandas/tests/extension/decimal/array.py | 2 +- .../tests/extension/decimal/test_decimal.py | 8 +- pandas/tests/extension/json/__init__.py | 2 +- pandas/tests/extension/json/test_json.py | 3 +- pandas/tests/extension/list/__init__.py | 2 +- pandas/tests/extension/list/test_list.py | 3 +- pandas/tests/extension/test_categorical.py | 4 +- pandas/tests/extension/test_numpy.py | 3 +- pandas/tests/frame/indexing/test_indexing.py | 28 - pandas/tests/frame/methods/test_astype.py | 5 + pandas/tests/frame/methods/test_reindex.py | 31 +- .../tests/frame/methods/test_sort_values.py | 8 + pandas/tests/frame/test_api.py | 14 +- pandas/tests/frame/test_stack_unstack.py | 22 + pandas/tests/generic/test_frame.py | 3 +- pandas/tests/generic/test_series.py | 3 +- .../indexes/categorical/test_category.py | 3 +- pandas/tests/indexes/common.py | 60 +- pandas/tests/indexes/datetimelike.py | 3 +- .../indexes/datetimes/test_datetimelike.py | 3 +- pandas/tests/indexes/datetimes/test_ops.py | 3 - pandas/tests/indexes/multi/conftest.py | 7 +- pandas/tests/indexes/multi/test_compat.py | 39 - pandas/tests/indexes/numeric/test_indexing.py | 27 +- pandas/tests/indexes/period/test_ops.py | 2 - pandas/tests/indexes/period/test_period.py | 8 +- pandas/tests/indexes/ranges/test_range.py | 4 +- pandas/tests/indexes/test_any_index.py | 10 + pandas/tests/indexes/test_base.py | 18 +- pandas/tests/indexes/test_common.py | 107 +- pandas/tests/indexes/test_indexing.py | 63 +- pandas/tests/indexes/test_numeric.py | 13 +- pandas/tests/indexes/test_setops.py | 64 +- pandas/tests/indexes/timedeltas/test_ops.py | 2 - .../indexes/timedeltas/test_timedelta.py | 3 +- .../tests/indexing/multiindex/test_getitem.py | 79 + .../tests/indexing/multiindex/test_slice.py | 36 + pandas/tests/indexing/test_coercion.py | 29 +- pandas/tests/indexing/test_indexing.py | 3 +- pandas/tests/io/excel/test_odf.py | 8 - pandas/tests/io/excel/test_openpyxl.py | 67 + pandas/tests/io/excel/test_readers.py | 9 +- pandas/tests/io/excel/test_xlrd.py | 9 - pandas/tests/io/formats/test_style.py | 79 +- pandas/tests/io/parser/test_parse_dates.py | 37 + pandas/tests/io/test_stata.py | 45 + pandas/tests/libs/test_hashtable.py | 37 + pandas/tests/plotting/test_series.py | 19 + .../tests/resample/test_resampler_grouper.py | 12 +- pandas/tests/reshape/concat/test_append.py 
| 22 +- .../tests/reshape/concat/test_categorical.py | 1 + .../tests/scalar/timedelta/test_timedelta.py | 62 + pandas/tests/series/indexing/test_indexing.py | 34 +- pandas/tests/series/indexing/test_numeric.py | 25 +- pandas/tests/series/indexing/test_setitem.py | 159 +- pandas/tests/series/methods/test_astype.py | 5 + pandas/tests/series/methods/test_replace.py | 3 + .../offsets/test_offsets_properties.py | 2 + pandas/tests/tseries/offsets/test_ticks.py | 3 +- pandas/tests/util/test_assert_attr_equal.py | 30 + pandas/tests/window/test_api.py | 14 + pandas/tests/window/test_base_indexer.py | 5 +- pandas/tests/window/test_groupby.py | 10 + versioneer.py | 2 +- 158 files changed, 3638 insertions(+), 2738 deletions(-) create mode 100644 pandas/core/window/doc.py create mode 100644 pandas/tests/util/test_assert_attr_equal.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0940ce8be992..0fc6e61049a44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,10 +24,10 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.7.4 + rev: v2.9.0 hooks: - id: pyupgrade - args: [--py37-plus] + args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.7.0 hooks: @@ -192,6 +192,11 @@ repos: files: ^pandas/ exclude: ^pandas/tests/ - repo: https://github.com/MarcoGorelli/no-string-hints - rev: v0.1.6 + rev: v0.1.7 hooks: - id: no-string-hints +- repo: https://github.com/MarcoGorelli/abs-imports + rev: v0.1.2 + hooks: + - id: abs-imports + files: ^pandas/ diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9f5827eabee52..12de9b121ef6d 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -84,8 +84,8 @@ class ToCSVIndexes(BaseIO): def _create_df(rows, cols): index_cols = { "index1": np.random.randint(0, rows, rows), - "index2": np.full(rows, 1, dtype=np.int), - "index3": np.full(rows, 1, dtype=np.int), + "index2": np.full(rows, 1, dtype=int), + "index3": np.full(rows, 1, dtype=int), } data_cols = { f"col{i}": np.random.uniform(0, 100000.0, rows) for i in range(cols) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 5f8cdb2a0bdac..5738775fe2b27 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -140,8 +140,11 @@ class Pairwise: def setup(self, window, method, pairwise): N = 10 ** 4 + n_groups = 20 + groups = [i for _ in range(N // n_groups) for i in range(n_groups)] arr = np.random.random(N) self.df = pd.DataFrame(arr) + self.df_group = pd.DataFrame({"A": groups, "B": arr}).groupby("A") def time_pairwise(self, window, method, pairwise): if window is None: @@ -150,6 +153,13 @@ def time_pairwise(self, window, method, pairwise): r = self.df.rolling(window=window) getattr(r, method)(self.df, pairwise=pairwise) + def time_groupby(self, window, method, pairwise): + if window is None: + r = self.df_group.expanding() + else: + r = self.df_group.rolling(window=window) + getattr(r, method)(self.df, pairwise=pairwise) + class Quantile: params = ( diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3214b21133b72..b457bce8fe138 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -108,8 +108,8 @@ def setup(self): self.vals_short = np.arange(2).astype(object) self.vals_long = np.arange(10 ** 5).astype(object) # because of nans floats are special: - self.s_long_floats = 
Series(np.arange(10 ** 5, dtype=np.float)).astype(object) - self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(object) + self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float_)).astype(object) + self.vals_long_floats = np.arange(10 ** 5, dtype=np.float_).astype(object) def time_isin_nans(self): # if nan-objects are different objects, diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 114b4688fffaf..1058a270a76ba 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -180,8 +180,7 @@ "\n", "styles = [\n", " hover(),\n", - " {'selector': \"th\", 'props': [(\"font-size\", \"150%\"),\n", - " (\"text-align\", \"center\")]}\n", + " {'selector': \"th\", 'props': [(\"font-size\", \"150%\"), (\"text-align\", \"center\")]}\n", "]\n", "\n", "df.style.set_table_styles(styles)" @@ -224,7 +223,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also chain all of the above by setting the `overwrite` argument to `False` so that it preserves previous settings." + "We can also chain all of the above by setting the `overwrite` argument to `False` so that it preserves previous settings. We also show the CSS string input rather than the list of tuples." ] }, { @@ -238,13 +237,13 @@ " set_table_styles(styles).\\\n", " set_table_styles({\n", " 'A': [{'selector': '',\n", - " 'props': [('color', 'red')]}],\n", + " 'props': 'color:red;'}],\n", " 'B': [{'selector': 'td',\n", - " 'props': [('color', 'blue')]}]\n", + " 'props': 'color:blue;'}]\n", " }, axis=0, overwrite=False).\\\n", " set_table_styles({\n", " 3: [{'selector': 'td',\n", - " 'props': [('color', 'green')]}]\n", + " 'props': 'color:green;font-weight:bold;'}]\n", " }, axis=1, overwrite=False)\n", "s" ] diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index f2c7f139021a8..305ab23e23ed6 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -17,9 +17,12 @@ Fixed regressions - Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`) - Fixed regression in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`) +- Fixed regression in :class:`DataFrame.astype` and :class:`Series.astype` not casting to bytes dtype (:issue:`39474`) - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) +- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`) +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) - .. 
--------------------------------------------------------------------------- @@ -30,8 +33,9 @@ Bug fixes ~~~~~~~~~ - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) -- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`) - Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) +- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f0680bebdafe5..17d8c79994dbe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -53,8 +53,11 @@ Other enhancements - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`) - :meth:`DataFrame.plot.scatter` can now accept a categorical column as the argument to ``c`` (:issue:`12380`, :issue:`31357`) -- :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes. +- :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes (:issue:`35643`) +- :meth:`.Styler.set_tooltips_class` and :meth:`.Styler.set_table_styles` amended to optionally allow certain css-string input arguments (:issue:`39564`) - :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`) +- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files. + .. --------------------------------------------------------------------------- @@ -248,7 +251,8 @@ Performance improvements - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) -- Performance improvement in :meth:`core.window.Rolling.corr` and :meth:`core.window.Rolling.cov` (:issue:`39388`) +- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) +- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) .. 
--------------------------------------------------------------------------- @@ -278,6 +282,7 @@ Datetimelike - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`) - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`) - Bug in :meth:`Timestamp.round`, :meth:`Timestamp.floor`, :meth:`Timestamp.ceil` for values near the implementation bounds of :class:`Timestamp` (:issue:`39244`) +- Bug in :meth:`Timedelta.round`, :meth:`Timedelta.floor`, :meth:`Timedelta.ceil` for values near the implementation bounds of :class:`Timedelta` (:issue:`38964`) - Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`) Timedelta @@ -336,8 +341,12 @@ Indexing - Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`) - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`) - Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrect casting the datetime64 values to integers (:issue:`39266`) +- Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`) - Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`) - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) +- Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) +- Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) +- Missing ^^^^^^^ @@ -401,6 +410,8 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`) - Bug in :meth:`.DataFrameGroupBy.idxmin` and :meth:`.DataFrameGroupBy.idxmax` with ``ExtensionDtype`` columns (:issue:`38733`) - Bug in :meth:`Series.resample` would raise when the index was a :class:`PeriodIndex` consisting of ``NaT`` (:issue:`39227`) +- Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) +- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) Reshaping ^^^^^^^^^ @@ -410,8 +421,10 @@ Reshaping - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted 
categories (:issue:`38502`) - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) +- Bug in :meth:`DataFrame.stack` not handling ``NaN`` in :class:`MultiIndex` columns correct (:issue:`39481`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) -- +- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) +- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) Sparse ^^^^^^ @@ -432,6 +445,10 @@ Other - Bug in :class:`Index` constructor sometimes silently ignorning a specified ``dtype`` (:issue:`38879`) - Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) - Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`) +- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) +- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) +- Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`) +- - .. --------------------------------------------------------------------------- diff --git a/pandas/__init__.py b/pandas/__init__.py index 2b64100075987..cc4c99efc4345 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -180,7 +180,7 @@ import pandas.arrays # use the closest tagged version if possible -from ._version import get_versions +from pandas._version import get_versions v = get_versions() __version__ = v.get("closest-tag", v["version"]) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 2c7780e0d95fd..3527fe2d8cd8d 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -19,6 +19,7 @@ from pandas._libs.khash cimport ( are_equivalent_float64_t, are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, + kh_needed_n_buckets, kh_str_t, khcomplex64_t, khcomplex128_t, @@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels): ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data - kh_resize_int64(table, min(n, SIZE_HINT_LIMIT)) + kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a3e72ed858392..0b6bb170cc531 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable): def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() - if size_hint is not None: - size_hint = min(size_hint, SIZE_HINT_LIMIT) - kh_resize_{{dtype}}(self.table, size_hint) + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_{{dtype}}(self.table, size_hint) def __len__(self) -> int: return self.table.size @@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable): sizeof(Py_ssize_t)) # vals return overhead + 
for_flags + for_pairs + def get_state(self): + """ returns infos about the state of the hashtable""" + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + cpdef get_item(self, {{dtype}}_t val): cdef: khiter_t k @@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_str() - if size_hint is not None: - size_hint = min(size_hint, SIZE_HINT_LIMIT) - kh_resize_str(self.table, size_hint) + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_str(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: @@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable): sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs + def get_state(self): + """ returns infos about the state of the hashtable""" + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + cpdef get_item(self, str val): cdef: khiter_t k @@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() - if size_hint is not None: - size_hint = min(size_hint, SIZE_HINT_LIMIT) - kh_resize_pymap(self.table, size_hint) + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_pymap(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: @@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable): sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs + def get_state(self): + """ + returns infos about the current state of the hashtable like size, + number of buckets and so on. + """ + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + cpdef get_item(self, object val): cdef: khiter_t k diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 4684ecb8716c0..772d83e67394c 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT)) + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 69680e472bbc2..e1ea1fbf9bd46 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -57,7 +57,14 @@ cdef class {{name}}Engine(IndexEngine): with warnings.catch_warnings(): # e.g. 
if values is float64 and `val` is a str, suppress warning warnings.filterwarnings("ignore", category=FutureWarning) + {{if name in {'Float64', 'Float32'} }} + if util.is_nan(val): + indexer = np.isnan(values) + else: + indexer = values == val + {{else}} indexer = values == val + {{endif}} except TypeError: # if the equality above returns a bool, cython will raise TypeError # when trying to cast it to ndarray diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 82b8ca4d443db..ba805e9ff1251 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -120,4 +120,7 @@ cdef extern from "khash_python.h": bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil + khuint_t kh_needed_n_buckets(khuint_t element_n) nogil + + include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index fc7a650eebba4..0073aaf0195c7 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) { kh_resize_str(table->table, val); } + +// utility function: given the number of elements +// returns number of necessary buckets +khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){ + khuint_t candidate = n_elements; + kroundup32(candidate); + khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5); + return (upper_bound < n_elements) ? 2*candidate : candidate; + +} diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index f08b86aa63574..6135e54a4502e 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -27,18 +27,28 @@ "tz_compare", ] -from . 
import dtypes -from .conversion import OutOfBoundsTimedelta, localize_pydatetime -from .dtypes import Resolution -from .nattype import NaT, NaTType, iNaT, is_null_datetimelike, nat_strings -from .np_datetime import OutOfBoundsDatetime -from .offsets import BaseOffset, Tick, to_offset -from .period import IncompatibleFrequency, Period -from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta -from .timestamps import Timestamp -from .timezones import tz_compare -from .tzconversion import tz_convert_from_utc_single -from .vectorized import ( +from pandas._libs.tslibs import dtypes +from pandas._libs.tslibs.conversion import OutOfBoundsTimedelta, localize_pydatetime +from pandas._libs.tslibs.dtypes import Resolution +from pandas._libs.tslibs.nattype import ( + NaT, + NaTType, + iNaT, + is_null_datetimelike, + nat_strings, +) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +from pandas._libs.tslibs.offsets import BaseOffset, Tick, to_offset +from pandas._libs.tslibs.period import IncompatibleFrequency, Period +from pandas._libs.tslibs.timedeltas import ( + Timedelta, + delta_to_nanoseconds, + ints_to_pytimedelta, +) +from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.timezones import tz_compare +from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single +from pandas._libs.tslibs.vectorized import ( dt64arr_to_periodarr, get_resolution, ints_to_pydatetime, diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 57404b99c7628..2f25df9144f32 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -636,3 +636,154 @@ def get_locale_names(name_type: str, locale: object = None): """ with set_locale(locale, LC_TIME): return getattr(LocaleTime(), name_type) + + +# --------------------------------------------------------------------- +# Rounding + + +class RoundTo: + """ + enumeration defining the available rounding modes + + Attributes + ---------- + MINUS_INFTY + round towards -∞, or floor [2]_ + PLUS_INFTY + round towards +∞, or ceil [3]_ + NEAREST_HALF_EVEN + round to nearest, tie-break half to even [6]_ + NEAREST_HALF_MINUS_INFTY + round to nearest, tie-break half to -∞ [5]_ + NEAREST_HALF_PLUS_INFTY + round to nearest, tie-break half to +∞ [4]_ + + + References + ---------- + .. [1] "Rounding - Wikipedia" + https://en.wikipedia.org/wiki/Rounding + .. [2] "Rounding down" + https://en.wikipedia.org/wiki/Rounding#Rounding_down + .. [3] "Rounding up" + https://en.wikipedia.org/wiki/Rounding#Rounding_up + .. [4] "Round half up" + https://en.wikipedia.org/wiki/Rounding#Round_half_up + .. [5] "Round half down" + https://en.wikipedia.org/wiki/Rounding#Round_half_down + .. 
[6] "Round half to even" + https://en.wikipedia.org/wiki/Rounding#Round_half_to_even + """ + @property + def MINUS_INFTY(self) -> int: + return 0 + + @property + def PLUS_INFTY(self) -> int: + return 1 + + @property + def NEAREST_HALF_EVEN(self) -> int: + return 2 + + @property + def NEAREST_HALF_PLUS_INFTY(self) -> int: + return 3 + + @property + def NEAREST_HALF_MINUS_INFTY(self) -> int: + return 4 + + +cdef inline ndarray[int64_t] _floor_int64(int64_t[:] values, int64_t unit): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] result = np.empty(n, dtype="i8") + int64_t res, value + + with cython.overflowcheck(True): + for i in range(n): + value = values[i] + if value == NPY_NAT: + res = NPY_NAT + else: + res = value - value % unit + result[i] = res + + return result + + +cdef inline ndarray[int64_t] _ceil_int64(int64_t[:] values, int64_t unit): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] result = np.empty(n, dtype="i8") + int64_t res, value + + with cython.overflowcheck(True): + for i in range(n): + value = values[i] + + if value == NPY_NAT: + res = NPY_NAT + else: + remainder = value % unit + if remainder == 0: + res = value + else: + res = value + (unit - remainder) + + result[i] = res + + return result + + +cdef inline ndarray[int64_t] _rounddown_int64(values, int64_t unit): + return _ceil_int64(values - unit // 2, unit) + + +cdef inline ndarray[int64_t] _roundup_int64(values, int64_t unit): + return _floor_int64(values + unit // 2, unit) + + +def round_nsint64(values: np.ndarray, mode: RoundTo, nanos) -> np.ndarray: + """ + Applies rounding mode at given frequency + + Parameters + ---------- + values : np.ndarray[int64_t]` + mode : instance of `RoundTo` enumeration + nanos : np.int64 + Freq to round to, expressed in nanoseconds + + Returns + ------- + np.ndarray[int64_t] + """ + cdef: + int64_t unit = nanos + + if mode == RoundTo.MINUS_INFTY: + return _floor_int64(values, unit) + elif mode == RoundTo.PLUS_INFTY: + return _ceil_int64(values, unit) + elif mode == RoundTo.NEAREST_HALF_MINUS_INFTY: + return _rounddown_int64(values, unit) + elif mode == RoundTo.NEAREST_HALF_PLUS_INFTY: + return _roundup_int64(values, unit) + elif mode == RoundTo.NEAREST_HALF_EVEN: + # for odd unit there is no need of a tie break + if unit % 2: + return _rounddown_int64(values, unit) + quotient, remainder = np.divmod(values, unit) + mask = np.logical_or( + remainder > (unit // 2), + np.logical_and(remainder == (unit // 2), quotient % 2) + ) + quotient[mask] += 1 + return quotient * unit + + # if/elif above should catch all rounding modes defined in enum 'RoundTo': + # if flow of control arrives here, it is a bug + raise ValueError("round_nsint64 called with an unrecognized rounding mode") diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 25991cfbdb7a7..748a4c27e64ad 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -47,6 +47,7 @@ from pandas._libs.tslibs.util cimport ( is_integer_object, is_timedelta64_object, ) +from pandas._libs.tslibs.fields import RoundTo, round_nsint64 # ---------------------------------------------------------------------- # Constants @@ -1297,14 +1298,18 @@ class Timedelta(_Timedelta): object_state = self.value, return (Timedelta, object_state) - def _round(self, freq, rounder): + @cython.cdivision(True) + def _round(self, freq, mode): cdef: - int64_t result, unit + int64_t result, unit, remainder + ndarray[int64_t] arr from pandas._libs.tslibs.offsets import to_offset unit = 
to_offset(freq).nanos - result = unit * rounder(self.value / float(unit)) - return Timedelta(result, unit='ns') + + arr = np.array([self.value], dtype="i8") + result = round_nsint64(arr, mode, unit)[0] + return Timedelta(result, unit="ns") def round(self, freq): """ @@ -1323,7 +1328,7 @@ class Timedelta(_Timedelta): ------ ValueError if the freq cannot be converted """ - return self._round(freq, np.round) + return self._round(freq, RoundTo.NEAREST_HALF_EVEN) def floor(self, freq): """ @@ -1334,7 +1339,7 @@ class Timedelta(_Timedelta): freq : str Frequency string indicating the flooring resolution. """ - return self._round(freq, np.floor) + return self._round(freq, RoundTo.MINUS_INFTY) def ceil(self, freq): """ @@ -1345,7 +1350,7 @@ class Timedelta(_Timedelta): freq : str Frequency string indicating the ceiling resolution. """ - return self._round(freq, np.ceil) + return self._round(freq, RoundTo.PLUS_INFTY) # ---------------------------------------------------------------- # Arithmetic Methods diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 1df589073a6ba..5f6b614ac3d81 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -56,7 +56,12 @@ from pandas._libs.tslibs.util cimport ( is_timedelta64_object, ) -from pandas._libs.tslibs.fields import get_date_name_field, get_start_end_field +from pandas._libs.tslibs.fields import ( + RoundTo, + get_date_name_field, + get_start_end_field, + round_nsint64, +) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( @@ -110,152 +115,6 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base -class RoundTo: - """ - enumeration defining the available rounding modes - - Attributes - ---------- - MINUS_INFTY - round towards -∞, or floor [2]_ - PLUS_INFTY - round towards +∞, or ceil [3]_ - NEAREST_HALF_EVEN - round to nearest, tie-break half to even [6]_ - NEAREST_HALF_MINUS_INFTY - round to nearest, tie-break half to -∞ [5]_ - NEAREST_HALF_PLUS_INFTY - round to nearest, tie-break half to +∞ [4]_ - - - References - ---------- - .. [1] "Rounding - Wikipedia" - https://en.wikipedia.org/wiki/Rounding - .. [2] "Rounding down" - https://en.wikipedia.org/wiki/Rounding#Rounding_down - .. [3] "Rounding up" - https://en.wikipedia.org/wiki/Rounding#Rounding_up - .. [4] "Round half up" - https://en.wikipedia.org/wiki/Rounding#Round_half_up - .. [5] "Round half down" - https://en.wikipedia.org/wiki/Rounding#Round_half_down - .. 
[6] "Round half to even" - https://en.wikipedia.org/wiki/Rounding#Round_half_to_even - """ - @property - def MINUS_INFTY(self) -> int: - return 0 - - @property - def PLUS_INFTY(self) -> int: - return 1 - - @property - def NEAREST_HALF_EVEN(self) -> int: - return 2 - - @property - def NEAREST_HALF_PLUS_INFTY(self) -> int: - return 3 - - @property - def NEAREST_HALF_MINUS_INFTY(self) -> int: - return 4 - - -cdef inline ndarray[int64_t] _floor_int64(int64_t[:] values, int64_t unit): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] result = np.empty(n, dtype="i8") - int64_t res, value - - with cython.overflowcheck(True): - for i in range(n): - value = values[i] - if value == NPY_NAT: - res = NPY_NAT - else: - res = value - value % unit - result[i] = res - - return result - - -cdef inline ndarray[int64_t] _ceil_int64(int64_t[:] values, int64_t unit): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] result = np.empty(n, dtype="i8") - int64_t res, value - - with cython.overflowcheck(True): - for i in range(n): - value = values[i] - - if value == NPY_NAT: - res = NPY_NAT - else: - remainder = value % unit - if remainder == 0: - res = value - else: - res = value + (unit - remainder) - - result[i] = res - - return result - - -cdef inline ndarray[int64_t] _rounddown_int64(values, int64_t unit): - return _ceil_int64(values - unit//2, unit) - - -cdef inline ndarray[int64_t] _roundup_int64(values, int64_t unit): - return _floor_int64(values + unit//2, unit) - - -def round_nsint64(values: np.ndarray, mode: RoundTo, freq) -> np.ndarray: - """ - Applies rounding mode at given frequency - - Parameters - ---------- - values : np.ndarray[int64_t]` - mode : instance of `RoundTo` enumeration - freq : str, obj - - Returns - ------- - np.ndarray[int64_t] - """ - - unit = to_offset(freq).nanos - - if mode == RoundTo.MINUS_INFTY: - return _floor_int64(values, unit) - elif mode == RoundTo.PLUS_INFTY: - return _ceil_int64(values, unit) - elif mode == RoundTo.NEAREST_HALF_MINUS_INFTY: - return _rounddown_int64(values, unit) - elif mode == RoundTo.NEAREST_HALF_PLUS_INFTY: - return _roundup_int64(values, unit) - elif mode == RoundTo.NEAREST_HALF_EVEN: - # for odd unit there is no need of a tie break - if unit % 2: - return _rounddown_int64(values, unit) - quotient, remainder = np.divmod(values, unit) - mask = np.logical_or( - remainder > (unit // 2), - np.logical_and(remainder == (unit // 2), quotient % 2) - ) - quotient[mask] += 1 - return quotient * unit - - # if/elif above should catch all rounding modes defined in enum 'RoundTo': - # if flow of control arrives here, it is a bug - raise ValueError("round_nsint64 called with an unrecognized rounding mode") - - # ---------------------------------------------------------------------- def integer_op_not_supported(obj): @@ -1181,6 +1040,9 @@ class Timestamp(_Timestamp): return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq, ts.fold) def _round(self, freq, mode, ambiguous='raise', nonexistent='raise'): + cdef: + int64_t nanos = to_offset(freq).nanos + if self.tz is not None: value = self.tz_localize(None).value else: @@ -1189,7 +1051,7 @@ class Timestamp(_Timestamp): value = np.array([value], dtype=np.int64) # Will only ever contain 1 element for timestamp - r = round_nsint64(value, mode, freq)[0] + r = round_nsint64(value, mode, nanos)[0] result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize( diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 494d9ac60dd96..024bfb02fe09d 100644 --- 
a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -459,13 +459,24 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"): ): # np.nan return True + elif ( + isinstance(left_attr, (np.datetime64, np.timedelta64)) + and isinstance(right_attr, (np.datetime64, np.timedelta64)) + and type(left_attr) is type(right_attr) + and np.isnat(left_attr) + and np.isnat(right_attr) + ): + # np.datetime64("nat") or np.timedelta64("nat") + return True try: result = left_attr == right_attr except TypeError: # datetimetz on rhs may raise TypeError result = False - if not isinstance(result, bool): + if (left_attr is pd.NA) ^ (right_attr is pd.NA): + result = False + elif not isinstance(result, bool): result = result.all() if result: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 35c7b6547431f..bcad9f1ddab09 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,7 +17,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "2.5.7", + "openpyxl": "2.6.0", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", diff --git a/pandas/conftest.py b/pandas/conftest.py index 0734cf12cce0d..829ac64884dac 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -480,13 +480,41 @@ def index(request): index_fixture2 = index -@pytest.fixture(params=indices_dict.keys()) +@pytest.fixture( + params=[ + key for key in indices_dict if not isinstance(indices_dict[key], MultiIndex) + ] +) +def index_flat(request): + """ + index fixture, but excluding MultiIndex cases. + """ + key = request.param + return indices_dict[key].copy() + + +# Alias so we can test with cartesian product of index_flat +index_flat2 = index_flat + + +@pytest.fixture( + params=[ + key + for key in indices_dict + if key not in ["int", "uint", "range", "empty", "repeats"] + and not isinstance(indices_dict[key], MultiIndex) + ] +) def index_with_missing(request): """ - Fixture for indices with missing values + Fixture for indices with missing values. + + Integer-dtype and empty cases are excluded because they cannot hold missing + values. + + MultiIndex is excluded because isna() is not defined for MultiIndex. """ - if request.param in ["int", "uint", "range", "empty", "repeats"]: - pytest.skip("missing values not supported") + # GH 35538. 
Use deep copy to avoid illusive bug on np-dev # Azure pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 5c99f783c70d9..e3f159346cd51 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -27,18 +27,16 @@ AggFuncType, AggFuncTypeBase, AggFuncTypeDict, - AggObjType, Axis, FrameOrSeries, FrameOrSeriesUnion, ) -from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import is_dict_like, is_list_like -from pandas.core.dtypes.generic import ABCDataFrame, ABCNDFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.algorithms import safe_sort -from pandas.core.base import DataError, SpecificationError +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index @@ -532,215 +530,3 @@ def transform_str_or_callable( return obj.apply(func, args=args, **kwargs) except Exception: return func(obj, *args, **kwargs) - - -def agg_list_like( - obj: AggObjType, - arg: List[AggFuncTypeBase], - _axis: int, -) -> FrameOrSeriesUnion: - """ - Compute aggregation in the case of a list-like argument. - - Parameters - ---------- - obj : Pandas object to compute aggregation on. - arg : list - Aggregations to compute. - _axis : int, 0 or 1 - Axis to compute aggregation on. - - Returns - ------- - Result of aggregation. - """ - from pandas.core.reshape.concat import concat - - if _axis != 0: - raise NotImplementedError("axis other than 0 is not supported") - - if obj._selected_obj.ndim == 1: - selected_obj = obj._selected_obj - else: - selected_obj = obj._obj_with_exclusions - - results = [] - keys = [] - - # degenerate case - if selected_obj.ndim == 1: - for a in arg: - colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) - try: - new_res = colg.aggregate(a) - - except TypeError: - pass - else: - results.append(new_res) - - # make sure we find a good name - name = com.get_callable_name(a) or a - keys.append(name) - - # multiples - else: - for index, col in enumerate(selected_obj): - colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) - try: - new_res = colg.aggregate(arg) - except (TypeError, DataError): - pass - except ValueError as err: - # cannot aggregate - if "Must produce aggregated value" in str(err): - # raised directly in _aggregate_named - pass - elif "no results" in str(err): - # raised directly in _aggregate_multiple_funcs - pass - else: - raise - else: - results.append(new_res) - keys.append(col) - - # if we are empty - if not len(results): - raise ValueError("no results") - - try: - return concat(results, keys=keys, axis=1, sort=False) - except TypeError as err: - - # we are concatting non-NDFrame objects, - # e.g. a list of scalars - - from pandas import Series - - result = Series(results, index=keys, name=obj.name) - if is_nested_object(result): - raise ValueError( - "cannot combine transform and aggregation operations" - ) from err - return result - - -def agg_dict_like( - obj: AggObjType, - arg: AggFuncTypeDict, - _axis: int, -) -> FrameOrSeriesUnion: - """ - Compute aggregation in the case of a dict-like argument. - - Parameters - ---------- - obj : Pandas object to compute aggregation on. - arg : dict - label-aggregation pairs to compute. - _axis : int, 0 or 1 - Axis to compute aggregation on. - - Returns - ------- - Result of aggregation. 
- """ - is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) - - if _axis != 0: # pragma: no cover - raise ValueError("Can only pass dict with axis=0") - - selected_obj = obj._selected_obj - - # if we have a dict of any non-scalars - # eg. {'A' : ['mean']}, normalize all to - # be list-likes - # Cannot use arg.values() because arg may be a Series - if any(is_aggregator(x) for _, x in arg.items()): - new_arg: AggFuncTypeDict = {} - for k, v in arg.items(): - if not isinstance(v, (tuple, list, dict)): - new_arg[k] = [v] - else: - new_arg[k] = v - - # the keys must be in the columns - # for ndim=2, or renamers for ndim=1 - - # ok for now, but deprecated - # {'A': { 'ra': 'mean' }} - # {'A': { 'ra': ['mean'] }} - # {'ra': ['mean']} - - # not ok - # {'ra' : { 'A' : 'mean' }} - if isinstance(v, dict): - raise SpecificationError("nested renamer is not supported") - elif isinstance(selected_obj, ABCSeries): - raise SpecificationError("nested renamer is not supported") - elif ( - isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns - ): - raise KeyError(f"Column '{k}' does not exist!") - - arg = new_arg - - else: - # deprecation of renaming keys - # GH 15931 - keys = list(arg.keys()) - if isinstance(selected_obj, ABCDataFrame) and len( - selected_obj.columns.intersection(keys) - ) != len(keys): - cols = list( - safe_sort( - list(set(keys) - set(selected_obj.columns.intersection(keys))), - ) - ) - raise SpecificationError(f"Column(s) {cols} do not exist") - - from pandas.core.reshape.concat import concat - - if selected_obj.ndim == 1: - # key only used for output - colg = obj._gotitem(obj._selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} - else: - # key used for column selection and output - results = {key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()} - - # set the final keys - keys = list(arg.keys()) - - # Avoid making two isinstance calls in all and any below - is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] - - # combine results - if all(is_ndframe): - keys_to_use = [k for k in keys if not results[k].empty] - # Have to check, if at least one DataFrame is not empty. - keys_to_use = keys_to_use if keys_to_use != [] else keys - axis = 0 if isinstance(obj, ABCSeries) else 1 - result = concat({k: results[k] for k in keys_to_use}, axis=axis) - elif any(is_ndframe): - # There is a mix of NDFrames and scalars - raise ValueError( - "cannot perform both aggregation " - "and transformation operations " - "simultaneously" - ) - else: - from pandas import Series - - # we have a dict of scalars - # GH 36212 use name only if obj is a series - if obj.ndim == 1: - obj = cast("Series", obj) - name = obj.name - else: - name = None - - result = Series(results, name=name) - - return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 51920f1613c12..2b0d3f5aa8862 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -715,6 +715,8 @@ def factorize( values, dtype = _ensure_data(values) if original.dtype.kind in ["m", "M"]: + # Note: factorize_array will cast NaT bc it has a __int__ + # method, but will not cast the more-correct dtype.type("nat") na_value = iNaT else: na_value = None @@ -1658,18 +1660,6 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) return result -# TODO: can we de-duplicate with something in dtypes.missing? 
-def _get_default_fill_value(dtype, fill_value): - if fill_value is lib.no_default: - if is_extension_array_dtype(dtype): - fill_value = dtype.na_value - elif dtype.kind in ["m", "M"]: - fill_value = dtype.type("NaT") - else: - fill_value = np.nan - return fill_value - - def take_nd( arr, indexer, @@ -1711,7 +1701,8 @@ def take_nd( """ mask_info = None - fill_value = _get_default_fill_value(arr.dtype, fill_value) + if fill_value is lib.no_default: + fill_value = na_value_for_dtype(arr.dtype, compat=False) if isinstance(arr, ABCExtensionArray): # Check for EA to catch DatetimeArray, TimedeltaArray diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 8207f4d6e33d4..533190e692891 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -30,15 +30,18 @@ ) from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_dict_like, is_extension_array_dtype, is_list_like, is_sequence, ) -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCNDFrame, ABCSeries -from pandas.core.aggregation import agg_dict_like, agg_list_like +from pandas.core.algorithms import safe_sort +from pandas.core.base import DataError, SpecificationError +import pandas.core.common as com from pandas.core.construction import ( array as pd_array, create_series_with_explicit_dtype, @@ -171,12 +174,10 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]: return result, None if is_dict_like(arg): - arg = cast(AggFuncTypeDict, arg) - return agg_dict_like(obj, arg, _axis), True + return self.agg_dict_like(_axis), True elif is_list_like(arg): # we require a list, but not a 'str' - arg = cast(List[AggFuncTypeBase], arg) - return agg_list_like(obj, arg, _axis=_axis), None + return self.agg_list_like(_axis=_axis), None else: result = None @@ -188,6 +189,211 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]: # caller can react return result, True + def agg_list_like(self, _axis: int) -> FrameOrSeriesUnion: + """ + Compute aggregation in the case of a list-like argument. + + Parameters + ---------- + _axis : int, 0 or 1 + Axis to compute aggregation on. + + Returns + ------- + Result of aggregation. 
+ """ + from pandas.core.reshape.concat import concat + + obj = self.obj + arg = cast(List[AggFuncTypeBase], self.f) + + if _axis != 0: + raise NotImplementedError("axis other than 0 is not supported") + + if obj._selected_obj.ndim == 1: + selected_obj = obj._selected_obj + else: + selected_obj = obj._obj_with_exclusions + + results = [] + keys = [] + + # degenerate case + if selected_obj.ndim == 1: + for a in arg: + colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) + try: + new_res = colg.aggregate(a) + + except TypeError: + pass + else: + results.append(new_res) + + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + # multiples + else: + for index, col in enumerate(selected_obj): + colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) + try: + new_res = colg.aggregate(arg) + except (TypeError, DataError): + pass + except ValueError as err: + # cannot aggregate + if "Must produce aggregated value" in str(err): + # raised directly in _aggregate_named + pass + elif "no results" in str(err): + # raised directly in _aggregate_multiple_funcs + pass + else: + raise + else: + results.append(new_res) + keys.append(col) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + return concat(results, keys=keys, axis=1, sort=False) + except TypeError as err: + + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + if is_nested_object(result): + raise ValueError( + "cannot combine transform and aggregation operations" + ) from err + return result + + def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: + """ + Compute aggregation in the case of a dict-like argument. + + Parameters + ---------- + _axis : int, 0 or 1 + Axis to compute aggregation on. + + Returns + ------- + Result of aggregation. + """ + obj = self.obj + arg = cast(AggFuncTypeDict, self.f) + + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + + if _axis != 0: # pragma: no cover + raise ValueError("Can only pass dict with axis=0") + + selected_obj = obj._selected_obj + + # if we have a dict of any non-scalars + # eg. 
{'A' : ['mean']}, normalize all to + # be list-likes + # Cannot use arg.values() because arg may be a Series + if any(is_aggregator(x) for _, x in arg.items()): + new_arg: AggFuncTypeDict = {} + for k, v in arg.items(): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + + # the keys must be in the columns + # for ndim=2, or renamers for ndim=1 + + # ok for now, but deprecated + # {'A': { 'ra': 'mean' }} + # {'A': { 'ra': ['mean'] }} + # {'ra': ['mean']} + + # not ok + # {'ra' : { 'A' : 'mean' }} + if isinstance(v, dict): + raise SpecificationError("nested renamer is not supported") + elif isinstance(selected_obj, ABCSeries): + raise SpecificationError("nested renamer is not supported") + elif ( + isinstance(selected_obj, ABCDataFrame) + and k not in selected_obj.columns + ): + raise KeyError(f"Column '{k}' does not exist!") + + arg = new_arg + + else: + # deprecation of renaming keys + # GH 15931 + keys = list(arg.keys()) + if isinstance(selected_obj, ABCDataFrame) and len( + selected_obj.columns.intersection(keys) + ) != len(keys): + cols = list( + safe_sort( + list(set(keys) - set(selected_obj.columns.intersection(keys))), + ) + ) + raise SpecificationError(f"Column(s) {cols} do not exist") + + from pandas.core.reshape.concat import concat + + if selected_obj.ndim == 1: + # key only used for output + colg = obj._gotitem(obj._selection, ndim=1) + results = {key: colg.agg(how) for key, how in arg.items()} + else: + # key used for column selection and output + results = { + key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() + } + + # set the final keys + keys = list(arg.keys()) + + # Avoid making two isinstance calls in all and any below + is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + + # combine results + if all(is_ndframe): + keys_to_use = [k for k in keys if not results[k].empty] + # Have to check, if at least one DataFrame is not empty. + keys_to_use = keys_to_use if keys_to_use != [] else keys + axis = 0 if isinstance(obj, ABCSeries) else 1 + result = concat({k: results[k] for k in keys_to_use}, axis=axis) + elif any(is_ndframe): + # There is a mix of NDFrames and scalars + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) + else: + from pandas import Series + + # we have a dict of scalars + # GH 36212 use name only if obj is a series + if obj.ndim == 1: + obj = cast("Series", obj) + name = obj.name + else: + name = None + + result = Series(results, name=name) + + return result + def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]: """ Compute apply in case of a string. diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 1cac825cc0898..d0565dfff0eb1 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -3,7 +3,7 @@ """ import operator import re -from typing import Optional, Pattern, Union +from typing import Any, Optional, Pattern, Union import numpy as np @@ -13,13 +13,28 @@ is_datetimelike_v_numeric, is_numeric_v_string_like, is_re, + is_re_compilable, is_scalar, ) from pandas.core.dtypes.missing import isna +def should_use_regex(regex: bool, to_replace: Any) -> bool: + """ + Decide whether to treat `to_replace` as a regular expression. + """ + if is_re(to_replace): + regex = True + + regex = regex and is_re_compilable(to_replace) + + # Don't use regex if the pattern is empty. 
+ regex = regex and re.compile(to_replace).pattern != "" + return regex + + def compare_or_regex_search( - a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike + a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: np.ndarray ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -32,12 +47,14 @@ def compare_or_regex_search( a : array_like b : scalar or regex pattern regex : bool - mask : array_like + mask : np.ndarray[bool] Returns ------- mask : array_like of bool """ + if isna(b): + return ~mask def _check_comparison_types( result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 06b46c50e9467..eb7c9e69d962b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -373,13 +373,13 @@ def delete(self: NDArrayBackedExtensionArrayT, loc) -> NDArrayBackedExtensionArr # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def value_counts(self, dropna: bool = False): + def value_counts(self, dropna: bool = True): """ Return a Series containing counts of unique values. Parameters ---------- - dropna : bool, default False + dropna : bool, default True Don't include counts of NA values. Returns diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dd281a39907fd..86eafb34e847f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -23,8 +23,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops - -from .masked import BaseMaskedArray, BaseMaskedDtype +from pandas.core.arrays.masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import pyarrow diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 48316373a1140..af78b84923a9c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1416,7 +1416,7 @@ def notna(self): notnull = notna - def value_counts(self, dropna=True): + def value_counts(self, dropna: bool = True): """ Return a Series containing counts of each category. 
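Illustration (not part of the patch): a minimal, standalone sketch of the decision implemented by the ``should_use_regex`` helper added to pandas/core/array_algos/replace.py above. The function name ``should_use_regex_sketch`` and the example calls are hypothetical; ``is_re`` and ``is_re_compilable`` are the existing pandas type-checking utilities the helper relies on.

    import re

    from pandas.api.types import is_re, is_re_compilable


    def should_use_regex_sketch(regex: bool, to_replace) -> bool:
        # A pre-compiled pattern always forces regex matching.
        if is_re(to_replace):
            regex = True
        # Otherwise only use regex when requested and when the value can
        # actually be compiled into a pattern.
        regex = regex and is_re_compilable(to_replace)
        # An empty pattern carries no information, so fall back to a
        # literal (non-regex) replacement in that case.
        regex = regex and re.compile(to_replace).pattern != ""
        return regex


    # Hypothetical calls and the decisions they produce:
    # should_use_regex_sketch(True, r"\d+")            -> True
    # should_use_regex_sketch(True, "")                -> False (empty pattern)
    # should_use_regex_sketch(False, re.compile("a"))  -> True  (pre-compiled pattern)
    # should_use_regex_sketch(False, "abc")            -> False (regex not requested)
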
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1032559766ada..5ee7a5715d6af 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -32,11 +32,8 @@ iNaT, to_offset, ) -from pandas._libs.tslibs.timestamps import ( - RoundTo, - integer_op_not_supported, - round_nsint64, -) +from pandas._libs.tslibs.fields import RoundTo, round_nsint64 +from pandas._libs.tslibs.timestamps import integer_op_not_supported from pandas._typing import DatetimeLikeScalar, Dtype, DtypeObj, NpDtype from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning @@ -1606,7 +1603,8 @@ def _round(self, freq, mode, ambiguous, nonexistent): ) values = self.view("i8") - result = round_nsint64(values, mode, freq) + nanos = to_offset(freq).nanos + result = round_nsint64(values, mode, nanos) result = self._maybe_mask_results(result, fill_value=iNaT) return self._simple_new(result, dtype=self.dtype) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 2c3b3d3c2f0b4..bc8f2af4f3801 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -23,11 +23,10 @@ from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.missing import isna +from pandas.core.arrays.numeric import NumericArray, NumericDtype from pandas.core.ops import invalid_comparison from pandas.core.tools.numeric import to_numeric -from .numeric import NumericArray, NumericDtype - class FloatingDtype(NumericDtype): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ff1af80f81ac6..363832ec89240 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -23,12 +23,11 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays.masked import BaseMaskedArray, BaseMaskedDtype +from pandas.core.arrays.numeric import NumericArray, NumericDtype from pandas.core.ops import invalid_comparison from pandas.core.tools.numeric import to_numeric -from .masked import BaseMaskedArray, BaseMaskedDtype -from .numeric import NumericArray, NumericDtype - class _IntegerDtype(NumericDtype): """ diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 882ca0955bc99..f4db68a2d7ac5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1021,7 +1021,7 @@ def _validate_setitem_value(self, value): raise ValueError("Cannot set float NaN to integer-backed IntervalArray") return value_left, value_right - def value_counts(self, dropna=True): + def value_counts(self, dropna: bool = True): """ Returns a Series containing counts of each interval. 
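Illustration (not part of the patch): a small sketch of the rounding refactor shown in the datetimelike.py hunk above, where the frequency is resolved to nanoseconds via ``to_offset(freq).nanos`` before ``round_nsint64`` is called. The flooring arithmetic below mirrors the ``_floor_int64`` helper; the variable names are hypothetical.

    import pandas as pd
    from pandas.tseries.frequencies import to_offset

    freq = "15min"
    nanos = to_offset(freq).nanos            # 15 minutes expressed in nanoseconds

    # int64 nanoseconds since the epoch, as used internally by datetime-like arrays
    values = pd.to_datetime(["2021-01-30 13:54:17"]).asi8

    # Same arithmetic as _floor_int64: drop the remainder modulo the unit.
    floored = values - values % nanos
    print(pd.to_datetime(floored))           # DatetimeIndex(['2021-01-30 13:45:00'], ...)
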
@@ -1413,6 +1413,16 @@ def to_tuples(self, na_tuple=True): # --------------------------------------------------------------------- + def putmask(self, mask: np.ndarray, value) -> None: + value_left, value_right = self._validate_setitem_value(value) + + if isinstance(self._left, np.ndarray): + np.putmask(self._left, mask, value_left) + np.putmask(self._right, mask, value_right) + else: + self._left.putmask(mask, value_left) + self._right.putmask(mask, value_right) + def delete(self: IntervalArrayT, loc) -> IntervalArrayT: if isinstance(self._left, np.ndarray): new_left = np.delete(self._left, loc) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 49f0d7e66c005..69499bc7e4a77 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -18,8 +18,7 @@ ) from pandas.core import ops - -from .masked import BaseMaskedArray, BaseMaskedDtype +from pandas.core.arrays.masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import pyarrow diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 2c69096e56973..4f68ed3d9a79d 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -382,6 +382,8 @@ def __init__( stacklevel=2, ) data = np.asarray(data, dtype="datetime64[ns]") + if fill_value is NaT: + fill_value = np.datetime64("NaT", "ns") data = np.asarray(data) sparse_values, sparse_index, fill_value = make_sparse( data, kind=kind, fill_value=fill_value, dtype=dtype @@ -723,13 +725,13 @@ def factorize(self, na_sentinel=-1): uniques = SparseArray(uniques, dtype=self.dtype) return codes, uniques - def value_counts(self, dropna=True): + def value_counts(self, dropna: bool = True): """ Returns a Series containing counts of unique values. Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't include counts of NaN, even if NaN is in sp_values. Returns diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2e4580207bc8a..65618ce32b6d7 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -338,7 +338,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) - def value_counts(self, dropna=False): + def value_counts(self, dropna: bool = True): from pandas import value_counts return value_counts(self._ndarray, dropna=dropna).astype("Int64") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e9160c92435a4..480aaf3d48f62 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -419,7 +419,7 @@ def _add_period(self, other: Period): Add a Period object. 
""" # We will wrap in a PeriodArray and defer to the reversed operation - from .period import PeriodArray + from pandas.core.arrays.period import PeriodArray i8vals = np.broadcast_to(other.ordinal, self.shape) oth = PeriodArray(i8vals, freq=other.freq) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0be3970159fbd..1cb592f18dd2c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1221,7 +1221,7 @@ def soft_convert_objects( values = lib.maybe_convert_objects( values, convert_datetime=datetime, convert_timedelta=timedelta ) - except OutOfBoundsDatetime: + except (OutOfBoundsDatetime, ValueError): return values if numeric and is_object_dtype(values.dtype): @@ -1904,12 +1904,15 @@ def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None: ): raise ValueError("Cannot assign nan to integer series") - if dtype.kind in ["i", "u", "f", "c"]: + elif dtype.kind in ["i", "u", "f", "c"]: if is_bool(value) or isinstance(value, np.timedelta64): # numpy will cast td64 to integer if we're not careful raise ValueError( f"Cannot assign {type(value).__name__} to float/integer series" ) + elif dtype.kind == "b": + if is_scalar(value) and not is_bool(value): + raise ValueError(f"Cannot assign {type(value).__name__} to bool series") def can_hold_element(dtype: np.dtype, element: Any) -> bool: diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 624e71a5cf760..5b46bee96d4b3 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,7 +1,7 @@ """ Utility functions related to concat. """ -from typing import Set, cast +from typing import cast import numpy as np @@ -14,49 +14,13 @@ is_extension_array_dtype, is_sparse, ) -from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import array, ensure_wrapped_if_datetimelike -def _get_dtype_kinds(arrays) -> Set[str]: - """ - Parameters - ---------- - arrays : list of arrays - - Returns - ------- - set[str] - A set of kinds that exist in this list of arrays. - """ - typs: Set[str] = set() - for arr in arrays: - # Note: we use dtype.kind checks because they are much more performant - # than is_foo_dtype - - dtype = arr.dtype - if not isinstance(dtype, np.dtype): - # ExtensionDtype so we get - # e.g. "categorical", "datetime64[ns, US/Central]", "Sparse[itn64, 0]" - typ = str(dtype) - elif isinstance(arr, ABCRangeIndex): - typ = "range" - elif dtype.kind == "M": - typ = "datetime" - elif dtype.kind == "m": - typ = "timedelta" - elif dtype.kind in ["O", "b"]: - typ = str(dtype) # i.e. 
"object", "bool" - else: - typ = dtype.kind - - typs.add(typ) - return typs - - def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ Helper function for `arr.astype(common_dtype)` but handling all special @@ -130,8 +94,7 @@ def is_nonempty(x) -> bool: if non_empties and axis == 0: to_concat = non_empties - typs = _get_dtype_kinds(to_concat) - _contains_datetime = any(typ.startswith("datetime") for typ in typs) + kinds = {obj.dtype.kind for obj in to_concat} all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 @@ -150,17 +113,16 @@ def is_nonempty(x) -> bool: else: return np.concatenate(to_concat) - elif _contains_datetime or "timedelta" in typs: + elif any(kind in ["m", "M"] for kind in kinds): return _concat_datetime(to_concat, axis=axis) elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) - typs = _get_dtype_kinds(to_concat) - if len(typs) != 1: + if len(kinds) != 1: - if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): + if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}): # let numpy coerce pass else: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0455c01fa085..0db0b1f6a97ef 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -559,14 +559,14 @@ def na_value_for_dtype(dtype, compat: bool = True): >>> na_value_for_dtype(np.dtype('bool')) False >>> na_value_for_dtype(np.dtype('datetime64[ns]')) - NaT + numpy.datetime64('NaT') """ dtype = pandas_dtype(dtype) if is_extension_array_dtype(dtype): return dtype.na_value - if needs_i8_conversion(dtype): - return NaT + elif needs_i8_conversion(dtype): + return dtype.type("NaT", "ns") elif is_float_dtype(dtype): return np.nan elif is_integer_dtype(dtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7bcc6523bb2be..6357b8feb348b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5465,15 +5465,13 @@ def duplicated( 4 True dtype: bool """ - from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 + from pandas._libs.hashtable import duplicated_int64 if self.empty: return self._constructor_sliced(dtype=bool) def f(vals): - labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), SIZE_HINT_LIMIT) - ) + labels, shape = algorithms.factorize(vals, size_hint=len(self)) return labels.astype("i8", copy=False), len(shape) if subset is None: @@ -5562,7 +5560,9 @@ def sort_values( # type: ignore[override] ) if ignore_index: - new_data.set_axis(1, ibase.default_index(len(indexer))) + new_data.set_axis( + self._get_block_manager_axis(axis), ibase.default_index(len(indexer)) + ) result = self._constructor(new_data) if inplace: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a98ef15696339..12698efa86b28 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -56,7 +56,6 @@ from pandas.core import algorithms, nanops from pandas.core.aggregation import ( - agg_list_like, maybe_mangle_lambdas, reconstruct_func, validate_func_kwargs, @@ -679,7 +678,12 @@ def describe(self, **kwargs): return result.unstack() def value_counts( - self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + self, + normalize=False, + sort=True, + ascending=False, + bins=None, + dropna: bool = True, ): from pandas.core.reshape.merge import get_join_indexers @@ -978,7 +982,9 @@ 
def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # try to treat as if we are passing a list try: - result = agg_list_like(self, [func], _axis=self.axis) + result, _ = GroupByApply( + self, [func], args=(), kwds={"_axis": self.axis} + ).agg() # select everything except for the last level, which is the one # containing the name of the function(s), see GH 32040 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 205bbcc07fc76..f2fd5ca9c62c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -115,7 +115,7 @@ ) if TYPE_CHECKING: - from pandas import MultiIndex, RangeIndex, Series + from pandas import IntervalIndex, MultiIndex, RangeIndex, Series from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -2292,14 +2292,6 @@ def _isnan(self): values.fill(False) return values - @cache_readonly - @final - def _nan_idxs(self): - if self._can_hold_na: - return self._isnan.nonzero()[0] - else: - return np.array([], dtype=np.intp) - @cache_readonly def hasnans(self) -> bool: """ @@ -3224,6 +3216,9 @@ def get_loc(self, key, method=None, tolerance=None): except KeyError as err: raise KeyError(key) from err + if is_scalar(key) and isna(key) and not self.hasnans: + raise KeyError(key) + if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) @@ -4321,19 +4316,8 @@ def where(self, cond, other=None): >>> idx.where(idx.isin(['car', 'train']), 'other') Index(['car', 'other', 'train', 'other'], dtype='object') """ - if other is None: - other = self._na_value - - values = self.values - - try: - self._validate_fill_value(other) - except (ValueError, TypeError): - return self.astype(object).where(cond, other) - - values = np.where(cond, values, other) - - return Index(values, name=self.name) + cond = np.asarray(cond, dtype=bool) + return self.putmask(~cond, other) # construction helpers @final @@ -4548,17 +4532,32 @@ def putmask(self, mask, value): numpy.ndarray.putmask : Changes elements of an array based on conditional and input values. """ - values = self._values.copy() + mask = np.asarray(mask, dtype=bool) + if mask.shape != self.shape: + raise ValueError("putmask: mask and data must be the same size") + if not mask.any(): + return self.copy() + + if value is None: + value = self._na_value try: converted = self._validate_fill_value(value) except (ValueError, TypeError) as err: if is_object_dtype(self): raise err - # coerces to object - return self.astype(object).putmask(mask, value) + dtype = self._find_common_type_compat(value) + return self.astype(dtype).putmask(mask, value) + + values = self._values.copy() + if isinstance(converted, np.timedelta64) and self.dtype == object: + # https://github.com/numpy/numpy/issues/12550 + # timedelta64 will incorrectly cast to int + converted = [converted] * mask.sum() + values[mask] = converted + else: + np.putmask(values, mask, converted) - np.putmask(values, mask, converted) return type(self)._simple_new(values, name=self.name) def equals(self, other: Any) -> bool: @@ -5195,18 +5194,31 @@ def _maybe_promote(self, other: Index): return self, other - def _find_common_type_compat(self, target: Index) -> DtypeObj: + @final + def _find_common_type_compat(self, target) -> DtypeObj: """ Implementation of find_common_type that adjusts for Index-specific special cases. """ - dtype = find_common_type([self.dtype, target.dtype]) + if is_interval_dtype(self.dtype) and is_valid_nat_for_dtype(target, self.dtype): + # e.g. 
setting NA value into IntervalArray[int64] + self = cast("IntervalIndex", self) + return IntervalDtype(np.float64, closed=self.closed) + + target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) + dtype = find_common_type([self.dtype, target_dtype]) if dtype.kind in ["i", "u"]: # TODO: what about reversed with self being categorical? - if is_categorical_dtype(target.dtype) and target.hasnans: + if ( + isinstance(target, Index) + and is_categorical_dtype(target.dtype) + and target.hasnans + ): # FIXME: find_common_type incorrect with Categorical GH#38240 # FIXME: some cases where float64 cast can be lossy? dtype = np.dtype(np.float64) + if dtype.kind == "c": + dtype = np.dtype(object) return dtype @final diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 40413bfb40b4b..9841b63029f17 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -799,29 +799,22 @@ def length(self): return Index(self._data.length, copy=False) def putmask(self, mask, value): - arr = self._data.copy() + mask = np.asarray(mask, dtype=bool) + if mask.shape != self.shape: + raise ValueError("putmask: mask and data must be the same size") + if not mask.any(): + return self.copy() + try: - value_left, value_right = arr._validate_setitem_value(value) + self._validate_fill_value(value) except (ValueError, TypeError): - return self.astype(object).putmask(mask, value) + dtype = self._find_common_type_compat(value) + return self.astype(dtype).putmask(mask, value) - if isinstance(self._data._left, np.ndarray): - np.putmask(arr._left, mask, value_left) - np.putmask(arr._right, mask, value_right) - else: - # TODO: special case not needed with __array_function__ - arr._left.putmask(mask, value_left) - arr._right.putmask(mask, value_right) + arr = self._data.copy() + arr.putmask(mask, value) return type(self)._simple_new(arr, name=self.name) - @Appender(Index.where.__doc__) - def where(self, cond, other=None): - if other is None: - other = self._na_value - values = np.where(cond, self._values, other) - result = IntervalArray(values) - return type(self)._simple_new(result, name=self.name) - def insert(self, loc, item): """ Return a new IntervalIndex inserting new item at location. Follows @@ -998,6 +991,9 @@ def func(self, other, sort=sort): # -------------------------------------------------------------------- + def _validate_fill_value(self, value): + return self._data._validate_setitem_value(value) + @property def _is_all_dates(self) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0d30c1665df34..46dbcdaab42b0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1282,16 +1282,18 @@ def _format_native_types(self, na_rep="nan", **kwargs): # go through the levels and format them for level, level_codes in zip(self.levels, self.codes): - level = level._format_native_types(na_rep=na_rep, **kwargs) + level_strs = level._format_native_types(na_rep=na_rep, **kwargs) # add nan values, if there are any mask = level_codes == -1 if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) + nan_index = len(level_strs) + # numpy 1.21 deprecated implicit string casting + level_strs = level_strs.astype(str) + level_strs = np.append(level_strs, na_rep) assert not level_codes.flags.writeable # i.e. 
copy is needed level_codes = level_codes.copy() # make writeable level_codes[mask] = nan_index - new_levels.append(level) + new_levels.append(level_strs) new_codes.append(level_codes) if len(new_levels) == 1: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a432b3952666e..777fc1c7c4ad2 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -341,15 +341,6 @@ def get_loc(self, key, method=None, tolerance=None): if is_bool(key): # Catch this to avoid accidentally casting to 1.0 raise KeyError(key) - - if is_float(key) and np.isnan(key): - nan_idxs = self._nan_idxs - if not len(nan_idxs): - raise KeyError(key) - elif len(nan_idxs) == 1: - return nan_idxs[0] - return nan_idxs - return super().get_loc(key, method=method, tolerance=tolerance) # ---------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ce7d5b511e811..cc7c5f666feda 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2407,9 +2407,11 @@ def pred(part) -> bool: """ # true when slice does *not* reduce, False when part is a tuple, # i.e. MultiIndex slice - return (isinstance(part, slice) or is_list_like(part)) and not isinstance( - part, tuple - ) + if isinstance(part, tuple): + # GH#39421 check for sub-slice: + return any((isinstance(s, slice) or is_list_like(s)) for s in part) + else: + return isinstance(part, slice) or is_list_like(part) if not is_list_like(slice_): if not isinstance(slice_, slice): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index eb8bb0fe90e9a..9314666acdaad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -17,7 +17,7 @@ ) from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, Dtype, DtypeObj, Scalar, Shape +from pandas._typing import ArrayLike, Dtype, DtypeObj, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -29,7 +29,6 @@ infer_dtype_from, maybe_downcast_numeric, maybe_downcast_to_dtype, - maybe_promote, maybe_upcast, soft_convert_objects, ) @@ -44,8 +43,6 @@ is_integer, is_list_like, is_object_dtype, - is_re, - is_re_compilable, is_sparse, pandas_dtype, ) @@ -59,7 +56,11 @@ putmask_smart, putmask_without_repeat, ) -from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex +from pandas.core.array_algos.replace import ( + compare_or_regex_search, + replace_regex, + should_use_regex, +) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -239,7 +240,7 @@ def array_values(self) -> ExtensionArray: """ return PandasArray(self.values) - def get_values(self, dtype: Optional[Dtype] = None): + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ return an internal format, currently just the ndarray this is often overridden to handle to_dense like operations @@ -282,7 +283,7 @@ def make_block(self, values, placement=None) -> Block: return make_block(values, placement=placement, ndim=self.ndim) - def make_block_same_class(self, values, placement=None, ndim=None): + def make_block_same_class(self, values, placement=None, ndim=None) -> Block: """ Wrap given values in a block of same type as self. 
""" if placement is None: placement = self.mgr_locs @@ -318,7 +319,7 @@ def _slice(self, slicer): return self.values[slicer] - def getitem_block(self, slicer, new_mgr_locs=None): + def getitem_block(self, slicer, new_mgr_locs=None) -> Block: """ Perform __getitem__-like, return result as block. @@ -338,11 +339,11 @@ def getitem_block(self, slicer, new_mgr_locs=None): return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) @property - def shape(self): + def shape(self) -> Shape: return self.values.shape @property - def dtype(self): + def dtype(self) -> DtypeObj: return self.values.dtype def iget(self, i): @@ -817,6 +818,12 @@ def _replace_list( """ See BlockManager._replace_list docstring. """ + # TODO: dont special-case Categorical + if self.is_categorical and len(algos.unique(dest_list)) == 1: + # We likely got here by tiling value inside NDFrame.replace, + # so un-tile here + return self.replace(src_list, dest_list[0], inplace, regex) + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -827,21 +834,14 @@ def _replace_list( src_len = len(pairs) - 1 - def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return ~mask - - return compare_or_regex_search(self.values, s, regex, mask) - if self.is_object: # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations mask = ~isna(self.values) - masks = [comp(s[0], mask, regex) for s in pairs] + masks = [ + compare_or_regex_search(self.values, s[0], regex=regex, mask=mask) + for s in pairs + ] else: # GH#38086 faster if we know we dont need to check for regex masks = [missing.mask_missing(self.values, s[0]) for s in pairs] @@ -1031,6 +1031,12 @@ def putmask(self, mask, new) -> List[Block]: elif not mask.any(): return [self] + elif isinstance(new, np.timedelta64): + # using putmask with object dtype will incorrect cast to object + # Having excluded self._can_hold_element, we know we cannot operate + # in-place, so we are safe using `where` + return self.where(new, ~mask) + else: # may need to upcast if transpose: @@ -1052,7 +1058,7 @@ def f(mask, val, idx): n = np.array(new) # type of the new block - dtype, _ = maybe_promote(n.dtype) + dtype = find_common_type([n.dtype, val.dtype]) # we need to explicitly astype here to make a copy n = n.astype(dtype) @@ -1063,7 +1069,7 @@ def f(mask, val, idx): new_blocks = self.split_and_operate(mask, f, True) return new_blocks - def coerce_to_target_dtype(self, other): + def coerce_to_target_dtype(self, other) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -1091,13 +1097,13 @@ def interpolate( coerce: bool = False, downcast: Optional[str] = None, **kwargs, - ): + ) -> List[Block]: inplace = validate_bool_kwarg(inplace, "inplace") if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op - return self if inplace else self.copy() + return [self] if inplace else [self.copy()] # a fill na type method try: @@ -1219,7 +1225,9 @@ def func(yvalues: np.ndarray) -> np.ndarray: blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) - def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_default): + def take_nd( + self, indexer, axis: int, new_mgr_locs=None, 
fill_value=lib.no_default + ) -> Block: """ Take values according to indexer and return them as a block.bb @@ -1256,7 +1264,7 @@ def diff(self, n: int, axis: int = 1) -> List[Block]: new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) return [self.make_block(values=new_values)] - def shift(self, periods: int, axis: int = 0, fill_value=None): + def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]: """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also @@ -1309,12 +1317,18 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: blocks = block.where(orig_other, cond, errors=errors, axis=axis) return self._maybe_downcast(blocks, "infer") - # convert datetime to datetime64, timedelta to timedelta64 - other = convert_scalar_for_putitemlike(other, values.dtype) + elif isinstance(other, np.timedelta64): + # expressions.where will cast np.timedelta64 to int + result = self.values.copy() + result[~cond] = [other] * (~cond).sum() + + else: + # convert datetime to datetime64, timedelta to timedelta64 + other = convert_scalar_for_putitemlike(other, values.dtype) - # By the time we get here, we should have all Series/Index - # args extracted to ndarray - result = expressions.where(cond, values, other) + # By the time we get here, we should have all Series/Index + # args extracted to ndarray + result = expressions.where(cond, values, other) if self._can_hold_na or self.ndim == 1: @@ -1369,7 +1383,7 @@ def _unstack(self, unstacker, fill_value, new_placement): blocks = [make_block(new_values, placement=new_placement)] return blocks, mask - def quantile(self, qs, interpolation="linear", axis: int = 0): + def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: """ compute the quantiles of the @@ -1462,7 +1476,7 @@ def _replace_coerce( putmask_inplace(nb.values, mask, value) return [nb] else: - regex = _should_use_regex(regex, to_replace) + regex = should_use_regex(regex, to_replace) if regex: return self._replace_regex( to_replace, @@ -1521,7 +1535,7 @@ def __init__(self, values, placement, ndim: int): raise AssertionError("block.size != values.size") @property - def shape(self): + def shape(self) -> Shape: # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: return (len(self.values),) @@ -1647,7 +1661,7 @@ def setitem(self, indexer, value): self.values[indexer] = value return self - def get_values(self, dtype: Optional[Dtype] = None): + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: # ExtensionArrays must be iterable, so this works. # TODO(EA2D): reshape not needed with 2D EAs return np.asarray(self.values).reshape(self.shape) @@ -1669,7 +1683,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): def take_nd( self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default - ): + ) -> Block: """ Take values according to indexer and return them as a block. 
""" @@ -1733,7 +1747,9 @@ def _slice(self, slicer): return self.values[slicer] - def fillna(self, value, limit=None, inplace=False, downcast=None): + def fillna( + self, value, limit=None, inplace: bool = False, downcast=None + ) -> List[Block]: values = self.values if inplace else self.values.copy() values = values.fillna(value=value, limit=limit) return [ @@ -1765,9 +1781,7 @@ def diff(self, n: int, axis: int = 1) -> List[Block]: axis = 0 return super().diff(n, axis) - def shift( - self, periods: int, axis: int = 0, fill_value: Any = None - ) -> List[ExtensionBlock]: + def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]: """ Shift the block by `periods`. @@ -1947,7 +1961,7 @@ def _holder(self): def fill_value(self): return np.datetime64("NaT", "ns") - def get_values(self, dtype: Optional[Dtype] = None): + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ return object dtype as boxed values, such as Timestamps/Timedelta """ @@ -1996,11 +2010,11 @@ def diff(self, n: int, axis: int = 0) -> List[Block]: TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer, ndim=self.ndim) ] - def shift(self, periods, axis=0, fill_value=None): + def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]: # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs values = self.array_values() new_values = values.shift(periods, fill_value=fill_value, axis=axis) - return self.make_block_same_class(new_values) + return [self.make_block_same_class(new_values)] def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ @@ -2118,7 +2132,7 @@ def is_view(self) -> bool: # check the ndarray values of the DatetimeIndex values return self.values._data.base is not None - def get_values(self, dtype: Optional[Dtype] = None): + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ Returns an ndarray of values. @@ -2157,7 +2171,9 @@ def external_values(self): return self.values._data return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - def fillna(self, value, limit=None, inplace=False, downcast=None): + def fillna( + self, value, limit=None, inplace: bool = False, downcast=None + ) -> List[Block]: # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. if self._can_hold_element(value): @@ -2168,7 +2184,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): value, limit=limit, inplace=inplace, downcast=downcast ) - def quantile(self, qs, interpolation="linear", axis=0): + def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: naive = self.values.view("M8[ns]") # TODO(EA2D): kludge for 2D block with 1D values @@ -2228,7 +2244,9 @@ def _maybe_coerce_values(self, values): def _holder(self): return TimedeltaArray - def fillna(self, value, **kwargs): + def fillna( + self, value, limit=None, inplace: bool = False, downcast=None + ) -> List[Block]: # TODO(EA2D): if we operated on array_values, TDA.fillna would handle # raising here. if is_integer(value): @@ -2238,7 +2256,7 @@ def fillna(self, value, **kwargs): "longer supported. To obtain the old behavior, pass " "`pd.Timedelta(seconds=n)` instead." 
) - return super().fillna(value, **kwargs) + return super().fillna(value, limit=limit, inplace=inplace, downcast=downcast) class ObjectBlock(Block): @@ -2247,7 +2265,7 @@ class ObjectBlock(Block): _can_hold_na = True def _maybe_coerce_values(self, values): - if issubclass(values.dtype.type, (str, bytes)): + if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) return values @@ -2347,7 +2365,7 @@ def replace( # here with listlike to_replace or value, as those cases # go through _replace_list - regex = _should_use_regex(regex, to_replace) + regex = should_use_regex(regex, to_replace) if regex: return self._replace_regex(to_replace, value, inplace=inplace) @@ -2355,36 +2373,9 @@ def replace( return super().replace(to_replace, value, inplace=inplace, regex=False) -def _should_use_regex(regex: bool, to_replace: Any) -> bool: - """ - Decide whether to treat `to_replace` as a regular expression. - """ - if is_re(to_replace): - regex = True - - regex = regex and is_re_compilable(to_replace) - - # Don't use regex if the pattern is empty. - regex = regex and re.compile(to_replace).pattern != "" - return regex - - class CategoricalBlock(ExtensionBlock): __slots__ = () - def _replace_list( - self, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> List[Block]: - if len(algos.unique(dest_list)) == 1: - # We likely got here by tiling value inside NDFrame.replace, - # so un-tile here - return self.replace(src_list, dest_list[0], inplace, regex) - return super()._replace_list(src_list, dest_list, inplace, regex) - def replace( self, to_replace, @@ -2450,7 +2441,9 @@ def get_block_type(values, dtype: Optional[Dtype] = None): return cls -def make_block(values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None): +def make_block( + values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None +) -> Block: # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. if isinstance(values, ABCPandasArray): @@ -2477,7 +2470,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype: Optional[Dtype] # ----------------------------------------------------------------- -def extend_blocks(result, blocks=None): +def extend_blocks(result, blocks=None) -> List[Block]: """ return a new extended blocks, given the result """ if blocks is None: blocks = [] diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 227ef1e93aeb7..3dcfa85ed5c08 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -3,15 +3,15 @@ from collections import defaultdict import copy import itertools -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Dict, List, Sequence, cast import numpy as np -from pandas._libs import NaT, internals as libinternals +from pandas._libs import internals as libinternals from pandas._typing import ArrayLike, DtypeObj, Manager, Shape from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.cast import find_common_type, maybe_promote from pandas.core.dtypes.common import ( get_dtype, is_categorical_dtype, @@ -338,7 +338,10 @@ def _concatenate_join_units( # Concatenating join units along ax0 is handled in _merge_blocks. 
raise AssertionError("Concatenating join units along axis0") - empty_dtype, upcasted_na = _get_empty_dtype_and_na(join_units) + empty_dtype = _get_empty_dtype(join_units) + + has_none_blocks = any(unit.block is None for unit in join_units) + upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) to_concat = [ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) @@ -375,7 +378,28 @@ def _concatenate_join_units( return concat_values -def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: +def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): + """ + Find the NA value to go with this dtype. + """ + if is_extension_array_dtype(dtype): + return dtype.na_value + elif dtype.kind in ["m", "M"]: + return dtype.type("NaT") + elif dtype.kind in ["f", "c"]: + return dtype.type("NaN") + elif dtype.kind == "b": + return None + elif dtype.kind in ["i", "u"]: + if not has_none_blocks: + return None + return np.nan + elif dtype.kind == "O": + return np.nan + raise NotImplementedError + + +def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ Return dtype and N/A values to use when concatenating specified units. @@ -384,69 +408,66 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A Returns ------- dtype - na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: - return np.dtype(np.float64), np.nan + return np.dtype(np.float64) if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype - upcasted_na = join_units[0].block.fill_value - return empty_dtype, upcasted_na - - has_none_blocks = False - dtypes = [None] * len(join_units) - for i, unit in enumerate(join_units): - if unit.block is None: - has_none_blocks = True - else: - dtypes[i] = unit.dtype + return empty_dtype + + has_none_blocks = any(unit.block is None for unit in join_units) + dtypes = [None if unit.block is None else unit.dtype for unit in join_units] + + filtered_dtypes = [ + unit.dtype for unit in join_units if unit.block is not None and not unit.is_na + ] + if not len(filtered_dtypes): + filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None] + dtype_alt = find_common_type(filtered_dtypes) upcast_classes = _get_upcast_classes(join_units, dtypes) + if is_extension_array_dtype(dtype_alt): + return dtype_alt + elif dtype_alt == object: + return dtype_alt + # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: - if len(upcast_classes) == 1: - cls = upcast_classes["extension"][0] - return cls, cls.na_value - else: - return np.dtype("object"), np.nan - elif "object" in upcast_classes: - return np.dtype(np.object_), np.nan + return np.dtype("object") elif "bool" in upcast_classes: if has_none_blocks: - return np.dtype(np.object_), np.nan + return np.dtype(np.object_) else: - return np.dtype(np.bool_), None - elif "category" in upcast_classes: - return np.dtype(np.object_), np.nan + return np.dtype(np.bool_) elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. 
dtype = upcast_classes["datetimetz"] - return dtype[0], NaT + return dtype[0] elif "datetime" in upcast_classes: - return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") + return np.dtype("M8[ns]") elif "timedelta" in upcast_classes: - return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") + return np.dtype("m8[ns]") else: try: common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray - return np.dtype(np.object_), np.nan + return np.dtype(np.object_) else: if is_float_dtype(common_dtype): - return common_dtype, common_dtype.type(np.nan) + return common_dtype elif is_numeric_dtype(common_dtype): if has_none_blocks: - return np.dtype(np.float64), np.nan + return np.dtype(np.float64) else: - return common_dtype, None + return common_dtype msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) @@ -481,7 +502,7 @@ def _get_upcast_classes( def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: """Select upcast class name based on dtype.""" if is_categorical_dtype(dtype): - return "category" + return "extension" elif is_datetime64tz_dtype(dtype): return "datetimetz" elif is_extension_array_dtype(dtype): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f1cf1aa9a72cb..c352fa39627d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1698,7 +1698,7 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def _form_blocks(arrays, names: Index, axes) -> List[Block]: +def _form_blocks(arrays, names: Index, axes: List[Index]) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1716,11 +1716,10 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: extra_locs.append(i) continue - k = names[name_idx] v = arrays[name_idx] block_type = get_block_type(v) - items_dict[block_type.__name__].append((i, k, v)) + items_dict[block_type.__name__].append((i, v)) blocks: List[Block] = [] if len(items_dict["FloatBlock"]): @@ -1742,7 +1741,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) - for i, _, array in items_dict["DatetimeTZBlock"] + for i, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1753,14 +1752,14 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ make_block(array, klass=CategoricalBlock, placement=i, ndim=2) - for i, _, array in items_dict["CategoricalBlock"] + for i, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): external_blocks = [ make_block(array, klass=ExtensionBlock, placement=i, ndim=2) - for i, _, array in items_dict["ExtensionBlock"] + for i, array in items_dict["ExtensionBlock"] ] blocks.extend(external_blocks) @@ -1768,7 +1767,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) - for i, _, array in items_dict["ObjectValuesExtensionBlock"] + for i, array in items_dict["ObjectValuesExtensionBlock"] ] blocks.extend(external_blocks) @@ -1804,7 +1803,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: def 
_multi_blockify(tuples, dtype: Optional[Dtype] = None): """ return an array of blocks that potentially have different dtypes """ # group by dtype - grouper = itertools.groupby(tuples, lambda x: x[2].dtype) + grouper = itertools.groupby(tuples, lambda x: x[1].dtype) new_blocks = [] for dtype, tup_block in grouper: @@ -1817,7 +1816,7 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None): return new_blocks -def _stack_arrays(tuples, dtype): +def _stack_arrays(tuples, dtype: np.dtype): # fml def _asarray_compat(x): @@ -1826,16 +1825,10 @@ def _asarray_compat(x): else: return np.asarray(x) - def _shape_compat(x) -> Shape: - if isinstance(x, ABCSeries): - return (len(x),) - else: - return x.shape - - placement, names, arrays = zip(*tuples) + placement, arrays = zip(*tuples) first = arrays[0] - shape = (len(arrays),) + _shape_compat(first) + shape = (len(arrays),) + first.shape stacked = np.empty(shape, dtype=dtype) for i, arr in enumerate(arrays): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 8d3363df0d132..4af1084033ce2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -417,8 +417,6 @@ def _na_for_min_count( if is_numeric_dtype(values): values = values.astype("float64") fill_value = na_value_for_dtype(values.dtype) - if fill_value is NaT: - fill_value = values.dtype.type("NaT", "ns") if values.ndim == 1: return fill_value diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d389f19598d14..abdc6ac9dfcbe 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -629,16 +629,12 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list( - zip( - *[ - lev.take(level_codes) - for lev, level_codes in zip( - this.columns.levels[:-1], this.columns.codes[:-1] - ) - ] - ) - ) + levs = [] + for lev, level_codes in zip(this.columns.levels[:-1], this.columns.codes[:-1]): + if -1 in level_codes: + lev = np.append(lev, None) + levs.append(np.take(lev, level_codes)) + tuples = list(zip(*levs)) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) @@ -650,7 +646,9 @@ def _convert_level_number(level_num, columns): new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) - level_vals_used = level_vals[level_codes] + level_vals_nan = level_vals.insert(len(level_vals), None) + + level_vals_used = np.take(level_vals_nan, level_codes) levsize = len(level_codes) drop_cols = [] for key in unique_groups: @@ -671,7 +669,7 @@ def _convert_level_number(level_num, columns): if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] - chunk.columns = level_vals.take(chunk.columns.codes[-1]) + chunk.columns = level_vals_nan.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( diff --git a/pandas/core/series.py b/pandas/core/series.py index f75292f32dbca..8bd325beede65 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1043,17 +1043,17 @@ def _set_value(self, label, value, takeable: bool = False): Scalar value. 
takeable : interpret the index as indexers, default False """ - try: - if takeable: - self._values[label] = value - else: + if not takeable: + try: loc = self.index.get_loc(label) - validate_numeric_casting(self.dtype, value) - self._values[loc] = value - except KeyError: + except KeyError: + # set using a non-recursive method + self.loc[label] = value + return + else: + loc = label - # set using a non-recursive method - self.loc[label] = value + self._set_values(loc, value) # ---------------------------------------------------------------------- # Unsorted diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2c2e0c16a4482..cfbabab491ae4 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True): space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). """ - size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT) + size_hint = len(group_index) table = hashtable.Int64HashTable(size_hint) group_index = ensure_int64(group_index) diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index 243250f0360a0..943686fc85a05 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -26,7 +26,7 @@ # - PandasArray # - Categorical -from .accessor import StringMethods -from .base import BaseStringArrayMethods +from pandas.core.strings.accessor import StringMethods +from pandas.core.strings.base import BaseStringArrayMethods __all__ = ["StringMethods", "BaseStringArrayMethods"] diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 6ebf610587d30..8e935b7c05300 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -8,22 +8,6 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.indexes.api import MultiIndex -from pandas.core.shared_docs import _shared_docs - -_shared_docs = dict(**_shared_docs) -_doc_template = """ - Returns - ------- - Series or DataFrame - Return type is determined by the caller. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrame data. - pandas.Series.%(func_name)s : Similar method for Series. - pandas.DataFrame.%(func_name)s : Similar method for DataFrame. -""" def flex_binary_moment(arg1, arg2, f, pairwise=False): diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py new file mode 100644 index 0000000000000..7e3f7895c0125 --- /dev/null +++ b/pandas/core/window/doc.py @@ -0,0 +1,119 @@ +"""Any shareable docstring components for rolling/expanding/ewm""" +from textwrap import dedent + +from pandas.core.shared_docs import _shared_docs + +_shared_docs = dict(**_shared_docs) + + +def create_section_header(header: str) -> str: + """Create numpydoc section header""" + return "\n".join((header, "-" * len(header))) + "\n" + + +template_header = "Calculate the {window_method} {aggregation_description}.\n\n" + +template_returns = dedent( + """ + Series or DataFrame + Return type is the same as the original object.\n + """ +).replace("\n", "", 1) + +template_see_also = dedent( + """ + pandas.Series.{window_method} : Calling {window_method} with Series data. + pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames. + pandas.Series.{agg_method} : Aggregating {agg_method} for Series. 
+ pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n + """ +).replace("\n", "", 1) + +args_compat = dedent( + """ + *args + For NumPy compatibility and will not have an effect on the result.\n + """ +).replace("\n", "", 1) + +kwargs_compat = dedent( + """ + **kwargs + For NumPy compatibility and will not have an effect on the result.\n + """ +).replace("\n", "", 1) + +kwargs_scipy = dedent( + """ + **kwargs + Keyword arguments to configure the ``SciPy`` weighted window type.\n + """ +).replace("\n", "", 1) + +window_apply_parameters = dedent( + """ + func : function + Must produce a single value from an ndarray input if ``raw=True`` + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + .. versionchanged:: 1.0.0 + + raw : bool, default None + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray + objects instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + engine : str, default None + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + + .. versionadded:: 1.0.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + .. versionadded:: 1.0.0 + + args : tuple, default None + Positional arguments to be passed into func. + + kwargs : dict, default None + Keyword arguments to be passed into func.\n + """ +).replace("\n", "", 1) + +numba_notes = ( + "See :ref:`window.numba_engine` for extended documentation " + "and performance considerations for the Numba engine.\n" +) + +window_agg_numba_parameters = dedent( + """ + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + + .. versionadded:: 1.3.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. 
versionadded:: 1.3.0\n + """ +).replace("\n", "", 1) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 6c16ff3edc1d2..e02555fb1e990 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -3,27 +3,31 @@ import datetime from functools import partial from textwrap import dedent -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import warnings import numpy as np from pandas._libs.tslibs import Timedelta import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import FrameOrSeries, TimedeltaConvertibleTypes +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, TimedeltaConvertibleTypes from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, doc +from pandas.util._decorators import doc from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.core.dtypes.missing import isna import pandas.core.common as common from pandas.core.util.numba_ import maybe_use_numba -from pandas.core.window.common import ( - _doc_template, +from pandas.core.window.common import zsqrt +from pandas.core.window.doc import ( _shared_docs, - flex_binary_moment, - zsqrt, + args_compat, + create_section_header, + kwargs_compat, + template_header, + template_returns, + template_see_also, ) from pandas.core.window.indexers import ( BaseIndexer, @@ -31,20 +35,7 @@ GroupbyIndexer, ) from pandas.core.window.numba_ import generate_numba_groupby_ewma_func -from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch - -if TYPE_CHECKING: - from pandas import Series - - -_bias_template = """ - Parameters - ---------- - bias : bool, default False - Use a standard estimation bias correction. - *args, **kwargs - Arguments and keyword arguments to be passed into func. -""" +from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby def get_center_of_mass( @@ -80,13 +71,20 @@ def get_center_of_mass( return float(comass) -def wrap_result(obj: Series, result: np.ndarray) -> Series: +def dispatch(name: str, *args, **kwargs): """ - Wrap a single 1D result. + Dispatch to groupby apply. 
""" - obj = obj._selected_obj - return obj._constructor(result, obj.index, name=obj.name) + def outer(self, *args, **kwargs): + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return getattr(x, name)(*args, **kwargs) + + return self._groupby.apply(f) + + outer.__name__ = name + return outer class ExponentialMovingWindow(BaseWindow): @@ -283,37 +281,33 @@ def _get_window_indexer(self) -> BaseIndexer: """ return ExponentialMovingWindowIndexer() - _agg_see_also_doc = dedent( - """ - See Also - -------- - pandas.DataFrame.rolling.aggregate - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.ewm(alpha=0.5).mean() - A B C - 0 1.000000 4.000000 7.000000 - 1 1.666667 4.666667 7.666667 - 2 2.428571 5.428571 8.428571 - """ - ) - @doc( _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, + see_also=dedent( + """ + See Also + -------- + pandas.DataFrame.rolling.aggregate + """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 + """ + ), klass="Series/Dataframe", axis="", ) @@ -322,17 +316,20 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - @Substitution(name="ewm", func_name="mean") - @Appender(_doc_template) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="ewm", + aggregation_description="(exponential weighted moment) mean", + agg_method="mean", + ) def mean(self, *args, **kwargs): - """ - Exponential weighted moving average. - - Parameters - ---------- - *args, **kwargs - Arguments and keyword arguments to be passed into func. - """ nv.validate_window_func("mean", args, kwargs) if self.times is not None: window_func = window_aggregations.ewma_time @@ -351,13 +348,26 @@ def mean(self, *args, **kwargs): ) return self._apply(window_func) - @Substitution(name="ewm", func_name="std") - @Appender(_doc_template) - @Appender(_bias_template) - def std(self, bias: bool = False, *args, **kwargs): - """ - Exponential weighted moving stddev. + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + bias : bool, default False + Use a standard estimation bias correction. """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="ewm", + aggregation_description="(exponential weighted moment) standard deviation", + agg_method="std", + ) + def std(self, bias: bool = False, *args, **kwargs): nv.validate_window_func("std", args, kwargs) return zsqrt(self.var(bias=bias, **kwargs)) @@ -372,13 +382,26 @@ def vol(self, bias: bool = False, *args, **kwargs): ) return self.std(bias, *args, **kwargs) - @Substitution(name="ewm", func_name="var") - @Appender(_doc_template) - @Appender(_bias_template) - def var(self, bias: bool = False, *args, **kwargs): - """ - Exponential weighted moving variance. 
+ @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + bias : bool, default False + Use a standard estimation bias correction. """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="ewm", + aggregation_description="(exponential weighted moment) variance", + agg_method="var", + ) + def var(self, bias: bool = False, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = window_aggregations.ewmcov window_func = partial( @@ -394,20 +417,11 @@ def var_func(values, begin, end, min_periods): return self._apply(var_func) - @Substitution(name="ewm", func_name="cov") - @Appender(_doc_template) - def cov( - self, - other: Optional[Union[np.ndarray, FrameOrSeries]] = None, - pairwise: Optional[bool] = None, - bias: bool = False, - **kwargs, - ): - """ - Exponential weighted sample covariance. - - Parameters - ---------- + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ other : Series, DataFrame, or ndarray, optional If not supplied then will default to self and produce pairwise output. @@ -420,48 +434,49 @@ def cov( observations will be used. bias : bool, default False Use a standard estimation bias correction. - **kwargs - Keyword arguments to be passed into func. """ - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) - - def _get_cov(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) - cov = window_aggregations.ewmcov( - X._prep_values(), + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="ewm", + aggregation_description="(exponential weighted moment) sample covariance", + agg_method="cov", + ) + def cov( + self, + other: Optional[FrameOrSeriesUnion] = None, + pairwise: Optional[bool] = None, + bias: bool = False, + **kwargs, + ): + from pandas import Series + + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) + result = window_aggregations.ewmcov( + x_array, np.array([0], dtype=np.int64), np.array([0], dtype=np.int64), self.min_periods, - Y._prep_values(), + y_array, self.com, self.adjust, self.ignore_na, bias, ) - return wrap_result(X, cov) - - return flex_binary_moment( - self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) - ) + return Series(result, index=x.index, name=x.name) - @Substitution(name="ewm", func_name="corr") - @Appender(_doc_template) - def corr( - self, - other: Optional[Union[np.ndarray, FrameOrSeries]] = None, - pairwise: Optional[bool] = None, - **kwargs, - ): - """ - Exponential weighted sample correlation. + return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) - Parameters - ---------- + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ other : Series, DataFrame, or ndarray, optional If not supplied then will default to self and produce pairwise output. @@ -472,44 +487,50 @@ def corr( output will be a MultiIndex DataFrame in the case of DataFrame inputs. In the case of missing elements, only complete pairwise observations will be used. - **kwargs - Keyword arguments to be passed into func. 
""" - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="ewm", + aggregation_description="(exponential weighted moment) sample correlation", + agg_method="corr", + ) + def corr( + self, + other: Optional[FrameOrSeriesUnion] = None, + pairwise: Optional[bool] = None, + **kwargs, + ): + from pandas import Series - def _get_corr(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) - def _cov(x, y): + def _cov(X, Y): return window_aggregations.ewmcov( - x, + X, np.array([0], dtype=np.int64), np.array([0], dtype=np.int64), self.min_periods, - y, + Y, self.com, self.adjust, self.ignore_na, 1, ) - x_values = X._prep_values() - y_values = Y._prep_values() with np.errstate(all="ignore"): - cov = _cov(x_values, y_values) - x_var = _cov(x_values, x_values) - y_var = _cov(y_values, y_values) - corr = cov / zsqrt(x_var * y_var) - return wrap_result(X, corr) - - return flex_binary_moment( - self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) - ) + cov = _cov(x_array, y_array) + x_var = _cov(x_array, x_array) + y_var = _cov(y_array, y_array) + result = cov / zsqrt(x_var * y_var) + return Series(result, index=x.index, name=x.name) + + return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index d215fd1d84ca5..f91441de41448 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -5,9 +5,20 @@ from pandas._typing import FrameOrSeries from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, doc - -from pandas.core.window.common import _doc_template, _shared_docs +from pandas.util._decorators import doc + +from pandas.core.window.doc import ( + _shared_docs, + args_compat, + create_section_header, + kwargs_compat, + numba_notes, + template_header, + template_returns, + template_see_also, + window_agg_numba_parameters, + window_apply_parameters, +) from pandas.core.window.indexers import BaseIndexer, ExpandingIndexer, GroupbyIndexer from pandas.core.window.rolling import BaseWindowGroupby, RollingAndExpandingMixin @@ -82,38 +93,34 @@ def _get_window_indexer(self) -> BaseIndexer: """ return ExpandingIndexer() - _agg_see_also_doc = dedent( - """ - See Also - -------- - pandas.DataFrame.aggregate : Similar DataFrame method. - pandas.Series.aggregate : Similar Series method. - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.ewm(alpha=0.5).mean() - A B C - 0 1.000000 4.000000 7.000000 - 1 1.666667 4.666667 7.666667 - 2 2.428571 5.428571 8.428571 - """ - ) - @doc( _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, + see_also=dedent( + """ + See Also + -------- + pandas.DataFrame.aggregate : Similar DataFrame method. + pandas.Series.aggregate : Similar Series method. 
+ """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 + """ + ), klass="Series/Dataframe", axis="", ) @@ -122,13 +129,31 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - @Substitution(name="expanding") - @Appender(_shared_docs["count"]) + @doc( + template_header, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="expanding", + aggregation_description="count of non NaN observations", + agg_method="count", + ) def count(self): return super().count() - @Substitution(name="expanding") - @Appender(_shared_docs["apply"]) + @doc( + template_header, + create_section_header("Parameters"), + window_apply_parameters, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="expanding", + aggregation_description="custom aggregation function", + agg_method="apply", + ) def apply( self, func: Callable[..., Any], @@ -147,92 +172,328 @@ def apply( kwargs=kwargs, ) - @Substitution(name="expanding") - @Appender(_shared_docs["sum"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + window_method="expanding", + aggregation_description="sum", + agg_method="sum", + ) def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_expanding_func("sum", args, kwargs) return super().sum(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="expanding", func_name="max") - @Appender(_doc_template) - @Appender(_shared_docs["max"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + window_method="expanding", + aggregation_description="maximum", + agg_method="max", + ) def max(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_expanding_func("max", args, kwargs) return super().max(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["min"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + window_method="expanding", + aggregation_description="minimum", + agg_method="min", + ) def min(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_expanding_func("min", args, kwargs) return super().min(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["mean"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), 
+ template_see_also, + create_section_header("Notes"), + numba_notes, + window_method="expanding", + aggregation_description="mean", + agg_method="mean", + ) def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_expanding_func("mean", args, kwargs) return super().mean(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["median"]) + @doc( + template_header, + create_section_header("Parameters"), + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + window_method="expanding", + aggregation_description="median", + agg_method="median", + ) def median(self, engine=None, engine_kwargs=None, **kwargs): return super().median(engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="expanding", versionadded="") - @Appender(_shared_docs["std"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n + """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.std : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.std` is different + than the default ``ddof`` of 0 in :func:`numpy.std`. + + A minimum of one period is required for the rolling calculation.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + + >>> s.expanding(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 0.957427 + 4 0.894427 + 5 0.836660 + 6 0.786796 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="standard deviation", + agg_method="std", + ) def std(self, ddof: int = 1, *args, **kwargs): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name="expanding", versionadded="") - @Appender(_shared_docs["var"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n + """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.var : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.var` is different + than the default ``ddof`` of 0 in :func:`numpy.var`. 
+ + A minimum of one period is required for the rolling calculation.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + + >>> s.expanding(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 0.916667 + 4 0.800000 + 5 0.700000 + 6 0.619048 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="variance", + agg_method="var", + ) def var(self, ddof: int = 1, *args, **kwargs): nv.validate_expanding_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["sem"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n + """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + "A minimum of one period is required for the calculation.\n", + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([0, 1, 2, 3]) + + >>> s.expanding().sem() + 0 NaN + 1 0.707107 + 2 0.707107 + 3 0.745356 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="standard error of mean", + agg_method="sem", + ) def sem(self, ddof: int = 1, *args, **kwargs): return super().sem(ddof=ddof, **kwargs) - @Substitution(name="expanding", func_name="skew") - @Appender(_doc_template) - @Appender(_shared_docs["skew"]) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.skew : Third moment of a probability density.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of three periods is required for the rolling calculation.\n", + window_method="expanding", + aggregation_description="unbiased skewness", + agg_method="skew", + ) def skew(self, **kwargs): return super().skew(**kwargs) - _agg_doc = dedent( + @doc( + template_header, + create_section_header("Parameters"), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.kurtosis : Reference SciPy method.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of four periods is required for the calculation.\n", + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}") + -1.200000 + >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}") + 4.999874 + >>> s = pd.Series(arr) + >>> s.expanding(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 4.999874 + dtype: float64 """ - Examples - -------- - - The example below will show an expanding calculation with a window size of - four matching the equivalent function call using `scipy.stats`. 
- - >>> arr = [1, 2, 3, 4, 999] - >>> import scipy.stats - >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") - -1.200000 - >>> print(f"{scipy.stats.kurtosis(arr, bias=False):.6f}") - 4.999874 - >>> s = pd.Series(arr) - >>> s.expanding(4).kurt() - 0 NaN - 1 NaN - 2 NaN - 3 -1.200000 - 4 4.999874 - dtype: float64 - """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="Fisher's definition of kurtosis without bias", + agg_method="kurt", ) - - @Appender(_agg_doc) - @Substitution(name="expanding") - @Appender(_shared_docs["kurt"]) def kurt(self, **kwargs): return super().kurt(**kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["quantile"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + quantile : float + Quantile to compute. 0 <= quantile <= 1. + interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="expanding", + aggregation_description="quantile", + agg_method="quantile", + ) def quantile( self, quantile, @@ -245,9 +506,35 @@ def quantile( **kwargs, ) - @Substitution(name="expanding", func_name="cov") - @Appender(_doc_template) - @Appender(_shared_docs["cov"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="expanding", + aggregation_description="sample covariance", + agg_method="cov", + ) def cov( self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None, @@ -257,8 +544,62 @@ def cov( ): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["corr"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. 
In the case of missing elements, only complete pairwise + observations will be used. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + cov : Similar method to calculate covariance. + numpy.corrcoef : NumPy Pearson's correlation calculation. + """ + ).replace("\n", "", 1), + template_see_also, + create_section_header("Notes"), + dedent( + """ + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + set to `True`. + + Function will return ``NaN`` for correlations of equal valued sequences; + this is the result of a 0/0 division error. + + When `pairwise` is set to `False`, only matching columns between `self` and + `other` will be used. + + When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the `other` DataFrame + columns on the second level. + + In the case of missing elements, only complete pairwise observations + will be used. + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="correlation", + agg_method="corr", + ) def corr( self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 439cd586825e1..9a68e470201c7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -29,7 +29,7 @@ from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, doc +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( ensure_float64, @@ -48,17 +48,28 @@ ) from pandas.core.dtypes.missing import notna +from pandas.core.algorithms import factorize from pandas.core.apply import ResamplerWindowApply from pandas.core.base import DataError, SelectionMixin +import pandas.core.common as common from pandas.core.construction import extract_array from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.reshape.concat import concat from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba -from pandas.core.window.common import ( - _doc_template, +from pandas.core.window.common import flex_binary_moment, zsqrt +from pandas.core.window.doc import ( _shared_docs, - flex_binary_moment, - zsqrt, + args_compat, + create_section_header, + kwargs_compat, + kwargs_scipy, + numba_notes, + template_header, + template_returns, + template_see_also, + window_agg_numba_parameters, + window_apply_parameters, ) from pandas.core.window.indexers import ( BaseIndexer, @@ -398,6 +409,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def _apply_tablewise( self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None ) -> FrameOrSeriesUnion: + """ + Apply the given function to the DataFrame across the entire object + """ if self._selected_obj.ndim == 1: raise ValueError("method='table' not applicable for Series objects.") obj = self._create_data(self._selected_obj) @@ -415,6 +429,23 @@ def _apply_tablewise( self._insert_on_column(out, obj) return out + def _apply_pairwise( + self, + 
target: FrameOrSeriesUnion, + other: Optional[FrameOrSeriesUnion], + pairwise: Optional[bool], + func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion], + ) -> FrameOrSeriesUnion: + """ + Apply the given pairwise function given 2 pandas objects (DataFrame/Series) + """ + if other is None: + other = target + # only default unset + pairwise = True if pairwise is None else pairwise + + return flex_binary_moment(target, other, func, pairwise=bool(pairwise)) + def _apply( self, func: Callable[..., Any], @@ -486,265 +517,6 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - _shared_docs["sum"] = dedent( - """ - Calculate %(name)s sum of given DataFrame or Series. - - Parameters - ---------- - *args, **kwargs - For compatibility with other %(name)s methods. Has no effect - on the computed value. - - Returns - ------- - Series or DataFrame - Same type as the input, with the same index, containing the - %(name)s sum. - - See Also - -------- - pandas.Series.sum : Reducing sum for Series. - pandas.DataFrame.sum : Reducing sum for DataFrame. - - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4, 5]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - dtype: int64 - - >>> s.rolling(3).sum() - 0 NaN - 1 NaN - 2 6.0 - 3 9.0 - 4 12.0 - dtype: float64 - - >>> s.expanding(3).sum() - 0 NaN - 1 NaN - 2 6.0 - 3 10.0 - 4 15.0 - dtype: float64 - - >>> s.rolling(3, center=True).sum() - 0 NaN - 1 6.0 - 2 9.0 - 3 12.0 - 4 NaN - dtype: float64 - - For DataFrame, each %(name)s sum is computed column-wise. - - >>> df = pd.DataFrame({"A": s, "B": s ** 2}) - >>> df - A B - 0 1 1 - 1 2 4 - 2 3 9 - 3 4 16 - 4 5 25 - - >>> df.rolling(3).sum() - A B - 0 NaN NaN - 1 NaN NaN - 2 6.0 14.0 - 3 9.0 29.0 - 4 12.0 50.0 - """ - ) - - _shared_docs["mean"] = dedent( - """ - Calculate the %(name)s mean of the values. - - Parameters - ---------- - *args - Under Review. - **kwargs - Under Review. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.mean : Equivalent method for Series. - pandas.DataFrame.mean : Equivalent method for DataFrame. - - Examples - -------- - The below examples will show rolling mean calculations with window sizes of - two and three, respectively. - - >>> s = pd.Series([1, 2, 3, 4]) - >>> s.rolling(2).mean() - 0 NaN - 1 1.5 - 2 2.5 - 3 3.5 - dtype: float64 - - >>> s.rolling(3).mean() - 0 NaN - 1 NaN - 2 2.0 - 3 3.0 - dtype: float64 - """ - ) - - _shared_docs["var"] = dedent( - """ - Calculate unbiased %(name)s variance. - %(versionadded)s - Normalized by N-1 by default. This can be changed using the `ddof` - argument. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller of the %(name)s calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.var : Equivalent method for Series. - pandas.DataFrame.var : Equivalent method for DataFrame. - numpy.var : Equivalent method for Numpy array. 
- - Notes - ----- - The default `ddof` of 1 used in :meth:`Series.var` is different than the - default `ddof` of 0 in :func:`numpy.var`. - - A minimum of 1 period is required for the rolling calculation. - - Examples - -------- - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - >>> s.rolling(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 1.000000 - 4 1.000000 - 5 1.333333 - 6 0.000000 - dtype: float64 - - >>> s.expanding(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 0.916667 - 4 0.800000 - 5 0.700000 - 6 0.619048 - dtype: float64 - """ - ) - - _shared_docs["std"] = dedent( - """ - Calculate %(name)s standard deviation. - %(versionadded)s - Normalized by N-1 by default. This can be changed using the `ddof` - argument. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller of the %(name)s calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.std : Equivalent method for Series. - pandas.DataFrame.std : Equivalent method for DataFrame. - numpy.std : Equivalent method for Numpy array. - - Notes - ----- - The default `ddof` of 1 used in Series.std is different than the default - `ddof` of 0 in numpy.std. - - A minimum of one period is required for the rolling calculation. - - Examples - -------- - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - >>> s.rolling(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 1.000000 - 4 1.000000 - 5 1.154701 - 6 0.000000 - dtype: float64 - - >>> s.expanding(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 0.957427 - 4 0.894427 - 5 0.836660 - 6 0.786796 - dtype: float64 - """ - ) - - -def dispatch(name: str, *args, **kwargs): - """ - Dispatch to groupby apply. 
- """ - - def outer(self, *args, **kwargs): - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return getattr(x, name)(*args, **kwargs) - - return self._groupby.apply(f) - - outer.__name__ = name - return outer - class BaseWindowGroupby(GotItemMixin, BaseWindow): """ @@ -761,9 +533,6 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - corr = dispatch("corr", other=None, pairwise=None) - cov = dispatch("cov", other=None, pairwise=None) - def _apply( self, func: Callable[..., Any], @@ -826,6 +595,85 @@ def _apply( result.index = result_index return result + def _apply_pairwise( + self, + target: FrameOrSeriesUnion, + other: Optional[FrameOrSeriesUnion], + pairwise: Optional[bool], + func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion], + ) -> FrameOrSeriesUnion: + """ + Apply the given pairwise function given 2 pandas objects (DataFrame/Series) + """ + # Manually drop the grouping column first + target = target.drop(columns=self._groupby.grouper.names, errors="ignore") + result = super()._apply_pairwise(target, other, pairwise, func) + # 1) Determine the levels + codes of the groupby levels + if other is not None: + # When we have other, we must reindex (expand) the result + # from flex_binary_moment to a "transform"-like result + # per groupby combination + old_result_len = len(result) + result = concat( + [ + result.take(gb_indices).reindex(result.index) + for gb_indices in self._groupby.indices.values() + ] + ) + + gb_pairs = ( + common.maybe_make_list(pair) for pair in self._groupby.indices.keys() + ) + groupby_codes = [] + groupby_levels = [] + # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]] + for gb_level_pair in map(list, zip(*gb_pairs)): + labels = np.repeat(np.array(gb_level_pair), old_result_len) + codes, levels = factorize(labels) + groupby_codes.append(codes) + groupby_levels.append(levels) + + else: + # When we evaluate the pairwise=True result, repeat the groupby + # labels by the number of columns in the original object + groupby_codes = self._groupby.grouper.codes + groupby_levels = self._groupby.grouper.levels + + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + + if target.ndim == 1: + repeat_by = 1 + else: + repeat_by = len(target.columns) + groupby_codes = [ + np.repeat(c.take(indexer), repeat_by) for c in groupby_codes + ] + # 2) Determine the levels + codes of the result from super()._apply_pairwise + if isinstance(result.index, MultiIndex): + result_codes = list(result.index.codes) + result_levels = list(result.index.levels) + result_names = list(result.index.names) + else: + idx_codes, idx_levels = factorize(result.index) + result_codes = [idx_codes] + result_levels = [idx_levels] + result_names = [result.index.name] + + # 3) Create the resulting index by combining 1) + 2) + result_codes = groupby_codes + result_codes + result_levels = groupby_levels + result_levels + result_names = self._groupby.grouper.names + result_names + + result_index = MultiIndex( + result_levels, result_codes, names=result_names, verify_integrity=False + ) + result.index = result_index + return result + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. 
@@ -1114,38 +962,34 @@ def calc(x): return self._apply_blockwise(homogeneous_func, name) - _agg_see_also_doc = dedent( - """ - See Also - -------- - pandas.DataFrame.aggregate : Similar DataFrame method. - pandas.Series.aggregate : Similar Series method. - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.rolling(2, win_type="boxcar").agg("mean") - A B C - 0 NaN NaN NaN - 1 1.5 4.5 7.5 - 2 2.5 5.5 8.5 - """ - ) - @doc( _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, + see_also=dedent( + """ + See Also + -------- + pandas.DataFrame.aggregate : Similar DataFrame method. + pandas.Series.aggregate : Similar Series method. + """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.rolling(2, win_type="boxcar").agg("mean") + A B C + 0 NaN NaN NaN + 1 1.5 4.5 7.5 + 2 2.5 5.5 8.5 + """ + ), klass="Series/DataFrame", axis="", ) @@ -1160,144 +1004,82 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - @Substitution(name="window") - @Appender(_shared_docs["sum"]) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="rolling", + aggregation_description="weighted window sum", + agg_method="sum", + ) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = window_aggregations.roll_weighted_sum return self._apply(window_func, name="sum", **kwargs) - @Substitution(name="window") - @Appender(_shared_docs["mean"]) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="rolling", + aggregation_description="weighted window mean", + agg_method="mean", + ) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = window_aggregations.roll_weighted_mean return self._apply(window_func, name="mean", **kwargs) - @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") - @Appender(_shared_docs["var"]) + @doc( + template_header, + ".. versionadded:: 1.0.0 \n\n", + create_section_header("Parameters"), + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="rolling", + aggregation_description="weighted window variance", + agg_method="var", + ) def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_weighted_var, ddof=ddof) kwargs.pop("name", None) return self._apply(window_func, name="var", **kwargs) - @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") - @Appender(_shared_docs["std"]) + @doc( + template_header, + ".. 
versionadded:: 1.0.0 \n\n", + create_section_header("Parameters"), + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="rolling", + aggregation_description="weighted window standard deviation", + agg_method="std", + ) def std(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) class RollingAndExpandingMixin(BaseWindow): - - _shared_docs["count"] = dedent( - r""" - The %(name)s count of any non-NaN observations inside the window. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.DataFrame.count : Count of the full DataFrame. - - Examples - -------- - >>> s = pd.Series([2, 3, np.nan, 10]) - >>> s.rolling(2).count() - 0 1.0 - 1 2.0 - 2 1.0 - 3 1.0 - dtype: float64 - >>> s.rolling(3).count() - 0 1.0 - 1 2.0 - 2 2.0 - 3 2.0 - dtype: float64 - >>> s.rolling(4).count() - 0 1.0 - 1 2.0 - 2 2.0 - 3 3.0 - dtype: float64 - """ - ) - def count(self): window_func = window_aggregations.roll_sum return self._apply(window_func, name="count") - _shared_docs["apply"] = dedent( - r""" - Apply an arbitrary function to each %(name)s window. - - Parameters - ---------- - func : function - Must produce a single value from an ndarray input if ``raw=True`` - or a single value from a Series if ``raw=False``. Can also accept a - Numba JIT function with ``engine='numba'`` specified. - - .. versionchanged:: 1.0.0 - - raw : bool, default None - * ``False`` : passes each row or column as a Series to the - function. - * ``True`` : the passed function will receive ndarray - objects instead. - If you are just applying a NumPy reduction function this will - achieve much better performance. - - engine : str, default None - * ``'cython'`` : Runs rolling apply through C-extensions from cython. - * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. - Only available when ``raw`` is set to ``True``. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - - .. versionadded:: 1.0.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be - applied to both the ``func`` and the ``apply`` rolling aggregation. - - .. versionadded:: 1.0.0 - - args : tuple, default None - Positional arguments to be passed into func. - kwargs : dict, default None - Keyword arguments to be passed into func. - - Returns - ------- - Series or DataFrame - Return type is determined by the caller. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrame data. - pandas.Series.apply : Similar method for Series. - pandas.DataFrame.apply : Similar method for DataFrame. - - Notes - ----- - See :ref:`window.numba_engine` for extended documentation and performance - considerations for the Numba engine. 
- """ - ) - def apply( self, func: Callable[..., Any], @@ -1383,34 +1165,6 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): window_func = window_aggregations.roll_sum return self._apply(window_func, name="sum", **kwargs) - _shared_docs["max"] = dedent( - """ - Calculate the %(name)s maximum. - - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs rolling max through C-extensions from cython. - * ``'numba'`` : Runs rolling max through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - - .. versionadded:: 1.3.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` - - .. versionadded:: 1.3.0 - - **kwargs - For compatibility with other %(name)s methods. Has no effect on - the result. - """ - ) - def max(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_window_func("max", args, kwargs) if maybe_use_numba(engine): @@ -1428,60 +1182,6 @@ def max(self, *args, engine=None, engine_kwargs=None, **kwargs): window_func = window_aggregations.roll_max return self._apply(window_func, name="max", **kwargs) - _shared_docs["min"] = dedent( - """ - Calculate the %(name)s minimum. - - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs rolling min through C-extensions from cython. - * ``'numba'`` : Runs rolling min through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - - .. versionadded:: 1.3.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` - - .. versionadded:: 1.3.0 - - **kwargs - For compatibility with other %(name)s methods. Has no effect on - the result. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with a Series. - pandas.DataFrame.%(name)s : Calling object with a DataFrame. - pandas.Series.min : Similar method for Series. - pandas.DataFrame.min : Similar method for DataFrame. - - Examples - -------- - Performing a rolling minimum with a window size of 3. - - >>> s = pd.Series([4, 3, 5, 2, 6]) - >>> s.rolling(3).min() - 0 NaN - 1 NaN - 2 3.0 - 3 2.0 - 4 2.0 - dtype: float64 - """ - ) - def min(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_window_func("min", args, kwargs) if maybe_use_numba(engine): @@ -1516,59 +1216,6 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): window_func = window_aggregations.roll_mean return self._apply(window_func, name="mean", **kwargs) - _shared_docs["median"] = dedent( - """ - Calculate the %(name)s median. - - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs rolling median through C-extensions from cython. - * ``'numba'`` : Runs rolling median through JIT compiled code from numba. 
- * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - - .. versionadded:: 1.3.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` - - .. versionadded:: 1.3.0 - - **kwargs - For compatibility with other %(name)s methods. Has no effect - on the computed result. - - Returns - ------- - Series or DataFrame - Returned type is the same as the original object. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.median : Equivalent method for Series. - pandas.DataFrame.median : Equivalent method for DataFrame. - - Examples - -------- - Compute the rolling median of a series with a window size of 3. - - >>> s = pd.Series([0, 1, 2, 3, 4]) - >>> s.rolling(3).median() - 0 NaN - 1 NaN - 2 1.0 - 3 2.0 - 4 3.0 - dtype: float64 - """ - ) - def median(self, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): if self.method == "table": @@ -1607,18 +1254,6 @@ def var(self, ddof: int = 1, *args, **kwargs): **kwargs, ) - _shared_docs[ - "skew" - ] = """ - Unbiased %(name)s skewness. - - Parameters - ---------- - **kwargs - For compatibility with other %(name)s methods. Has no effect on - the result. - """ - def skew(self, **kwargs): window_func = window_aggregations.roll_skew return self._apply( @@ -1627,92 +1262,9 @@ def skew(self, **kwargs): **kwargs, ) - _shared_docs["kurt"] = dedent( - """ - Calculate unbiased %(name)s kurtosis. - - This function uses Fisher's definition of kurtosis without bias. - - Parameters - ---------- - **kwargs - For compatibility with other %(name)s methods. Has no effect on - the result. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.kurt : Equivalent method for Series. - pandas.DataFrame.kurt : Equivalent method for DataFrame. - scipy.stats.skew : Third moment of a probability density. - scipy.stats.kurtosis : Reference SciPy method. - - Notes - ----- - A minimum of 4 periods is required for the %(name)s calculation. - """ - ) - def sem(self, ddof: int = 1, *args, **kwargs): return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) - _shared_docs["sem"] = dedent( - """ - Compute %(name)s standard error of mean. - - Parameters - ---------- - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.sem : Equivalent method for Series. - pandas.DataFrame.sem : Equivalent method for DataFrame. - - Notes - ----- - A minimum of one period is required for the rolling calculation. 
- - Examples - -------- - >>> s = pd.Series([0, 1, 2, 3]) - >>> s.rolling(2, min_periods=1).sem() - 0 NaN - 1 0.707107 - 2 0.707107 - 3 0.707107 - dtype: float64 - - >>> s.expanding().sem() - 0 NaN - 1 0.707107 - 2 0.707107 - 3 0.745356 - dtype: float64 - """ - ) - def kurt(self, **kwargs): window_func = window_aggregations.roll_kurt return self._apply( @@ -1721,78 +1273,6 @@ def kurt(self, **kwargs): **kwargs, ) - _shared_docs["quantile"] = dedent( - """ - Calculate the %(name)s quantile. - - Parameters - ---------- - quantile : float - Quantile to compute. 0 <= quantile <= 1. - - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points `i` and `j`: - - * linear: `i + (j - i) * fraction`, where `fraction` is the - fractional part of the index surrounded by `i` and `j`. - * lower: `i`. - * higher: `j`. - * nearest: `i` or `j` whichever is nearest. - * midpoint: (`i` + `j`) / 2. - - engine : str, default None - * ``'cython'`` : Runs rolling quantile through C-extensions from cython. - * ``'numba'`` : Runs rolling quantile through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - - .. versionadded:: 1.3.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` - - .. versionadded:: 1.3.0 - - **kwargs - For compatibility with other %(name)s methods. Has no effect on - the result. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.quantile : Computes value at the given quantile over all data - in Series. - pandas.DataFrame.quantile : Computes values at the given quantile over - requested axis in DataFrame. - - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4]) - >>> s.rolling(2).quantile(.4, interpolation='lower') - 0 NaN - 1 1.0 - 2 2.0 - 3 3.0 - dtype: float64 - - >>> s.rolling(2).quantile(.4, interpolation='midpoint') - 0 NaN - 1 1.5 - 2 2.5 - 3 3.5 - dtype: float64 - """ - ) - def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): if quantile == 1.0: window_func = window_aggregations.roll_max @@ -1807,36 +1287,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) - _shared_docs[ - "cov" - ] = """ - Calculate the %(name)s sample covariance. - - Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - If not supplied then will default to self and produce pairwise - output. - pairwise : bool, default None - If False then only matching columns between self and other will be - used and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndexed DataFrame in the case of DataFrame - inputs. In the case of missing elements, only complete pairwise - observations will be used. - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. 
- **kwargs - Keyword arguments to be passed into func. - """ - def cov(self, other=None, pairwise=None, ddof=1, **kwargs): - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - from pandas import Series def cov_func(x, y): @@ -1866,125 +1317,9 @@ def cov_func(x, y): result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof)) return Series(result, index=x.index, name=x.name) - return flex_binary_moment( - self._selected_obj, other, cov_func, pairwise=bool(pairwise) - ) - - _shared_docs["corr"] = dedent( - """ - Calculate %(name)s correlation. - - Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - If not supplied then will default to self. - pairwise : bool, default None - Calculate pairwise combinations of columns within a - DataFrame. If `other` is not specified, defaults to `True`, - otherwise defaults to `False`. - Not relevant for :class:`~pandas.Series`. - **kwargs - Unused. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the - %(name)s calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.corr : Equivalent method for Series. - pandas.DataFrame.corr : Equivalent method for DataFrame. - cov : Similar method to calculate covariance. - numpy.corrcoef : NumPy Pearson's correlation calculation. - - Notes - ----- - This function uses Pearson's definition of correlation - (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). - - When `other` is not specified, the output will be self correlation (e.g. - all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` - set to `True`. - - Function will return ``NaN`` for correlations of equal valued sequences; - this is the result of a 0/0 division error. - - When `pairwise` is set to `False`, only matching columns between `self` and - `other` will be used. - - When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame - with the original index on the first level, and the `other` DataFrame - columns on the second level. - - In the case of missing elements, only complete pairwise observations - will be used. - - Examples - -------- - The below example shows a rolling calculation with a window size of - four matching the equivalent function call using :meth:`numpy.corrcoef`. - - >>> v1 = [3, 3, 3, 5, 8] - >>> v2 = [3, 4, 4, 4, 8] - >>> # numpy returns a 2X2 array, the correlation coefficient - >>> # is the number at entry [0][1] - >>> print(f"{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}") - 0.333333 - >>> print(f"{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}") - 0.916949 - >>> s1 = pd.Series(v1) - >>> s2 = pd.Series(v2) - >>> s1.rolling(4).corr(s2) - 0 NaN - 1 NaN - 2 NaN - 3 0.333333 - 4 0.916949 - dtype: float64 - - The below example shows a similar rolling calculation on a - DataFrame using the pairwise option. - - >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ - [46., 31.], [50., 36.]]) - >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) - [[1. 0.6263001] - [0.6263001 1. ]] - >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) - [[1. 0.5553681] - [0.5553681 1. 
]] - >>> df = pd.DataFrame(matrix, columns=['X','Y']) - >>> df - X Y - 0 51.0 35.0 - 1 49.0 30.0 - 2 47.0 32.0 - 3 46.0 31.0 - 4 50.0 36.0 - >>> df.rolling(4).corr(pairwise=True) - X Y - 0 X NaN NaN - Y NaN NaN - 1 X NaN NaN - Y NaN NaN - 2 X NaN NaN - Y NaN NaN - 3 X 1.000000 0.626300 - Y 0.626300 1.000000 - 4 X 1.000000 0.555368 - Y 0.555368 1.000000 - """ - ) + return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) def corr(self, other=None, pairwise=None, ddof=1, **kwargs): - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise from pandas import Series @@ -2025,9 +1360,7 @@ def corr_func(x, y): result = numerator / denominator return Series(result, index=x.index, name=x.name) - return flex_binary_moment( - self._selected_obj, other, corr_func, pairwise=bool(pairwise) - ) + return self._apply_pairwise(self._selected_obj, other, pairwise, corr_func) class Rolling(RollingAndExpandingMixin): @@ -2087,44 +1420,40 @@ def _raise_monotonic_error(self): formatted = "index" raise ValueError(f"{formatted} must be monotonic") - _agg_see_also_doc = dedent( - """ - See Also - -------- - pandas.Series.rolling : Calling object with Series data. - pandas.DataFrame.rolling : Calling object with DataFrame data. - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.rolling(2).sum() - A B C - 0 NaN NaN NaN - 1 3.0 9.0 15.0 - 2 5.0 11.0 17.0 - - >>> df.rolling(2).agg({"A": "sum", "B": "min"}) - A B - 0 NaN NaN - 1 3.0 4.0 - 2 5.0 5.0 - """ - ) - @doc( _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, + see_also=dedent( + """ + See Also + -------- + pandas.Series.rolling : Calling object with Series data. + pandas.DataFrame.rolling : Calling object with DataFrame data. 
+ """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.rolling(2).sum() + A B C + 0 NaN NaN NaN + 1 3.0 9.0 15.0 + 2 5.0 11.0 17.0 + + >>> df.rolling(2).agg({"A": "sum", "B": "min"}) + A B + 0 NaN NaN + 1 3.0 4.0 + 2 5.0 5.0 + """ + ), klass="Series/Dataframe", axis="", ) @@ -2133,8 +1462,40 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - @Substitution(name="rolling") - @Appender(_shared_docs["count"]) + @doc( + template_header, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([2, 3, np.nan, 10]) + >>> s.rolling(2).count() + 0 1.0 + 1 2.0 + 2 1.0 + 3 1.0 + dtype: float64 + >>> s.rolling(3).count() + 0 1.0 + 1 2.0 + 2 2.0 + 3 2.0 + dtype: float64 + >>> s.rolling(4).count() + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="count of non NaN observations", + agg_method="count", + ) def count(self): if self.min_periods is None: warnings.warn( @@ -2146,10 +1507,24 @@ def count(self): FutureWarning, ) self.min_periods = 0 - return super().count() + result = super().count() + self.min_periods = None + else: + result = super().count() + return result - @Substitution(name="rolling") - @Appender(_shared_docs["apply"]) + @doc( + template_header, + create_section_header("Parameters"), + window_apply_parameters, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="rolling", + aggregation_description="custom aggregation function", + agg_method="apply", + ) def apply( self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None ): @@ -2162,92 +1537,444 @@ def apply( kwargs=kwargs, ) - @Substitution(name="rolling") - @Appender(_shared_docs["sum"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 2, 3, 4, 5]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + dtype: int64 + + >>> s.rolling(3).sum() + 0 NaN + 1 NaN + 2 6.0 + 3 9.0 + 4 12.0 + dtype: float64 + + >>> s.rolling(3, center=True).sum() + 0 NaN + 1 6.0 + 2 9.0 + 3 12.0 + 4 NaN + dtype: float64 + + For DataFrame, each sum is computed column-wise. 
+ + >>> df = pd.DataFrame({{"A": s, "B": s ** 2}}) + >>> df + A B + 0 1 1 + 1 2 4 + 2 3 9 + 3 4 16 + 4 5 25 + + >>> df.rolling(3).sum() + A B + 0 NaN NaN + 1 NaN NaN + 2 6.0 14.0 + 3 9.0 29.0 + 4 12.0 50.0 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="sum", + agg_method="sum", + ) def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_rolling_func("sum", args, kwargs) return super().sum(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="rolling", func_name="max") - @Appender(_doc_template) - @Appender(_shared_docs["max"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + window_method="rolling", + aggregation_description="maximum", + agg_method="max", + ) def max(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_rolling_func("max", args, kwargs) return super().max(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["min"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + Performing a rolling minimum with a window size of 3. + + >>> s = pd.Series([4, 3, 5, 2, 6]) + >>> s.rolling(3).min() + 0 NaN + 1 NaN + 2 3.0 + 3 2.0 + 4 2.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="minimum", + agg_method="min", + ) def min(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_rolling_func("min", args, kwargs) return super().min(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["mean"]) + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + The below examples will show rolling mean calculations with window sizes of + two and three, respectively. 
+ + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).mean() + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> s.rolling(3).mean() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="mean", + agg_method="mean", + ) def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_rolling_func("mean", args, kwargs) return super().mean(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["median"]) + @doc( + template_header, + create_section_header("Parameters"), + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + Compute the rolling median of a series with a window size of 3. + + >>> s = pd.Series([0, 1, 2, 3, 4]) + >>> s.rolling(3).median() + 0 NaN + 1 NaN + 2 1.0 + 3 2.0 + 4 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="median", + agg_method="median", + ) def median(self, engine=None, engine_kwargs=None, **kwargs): return super().median(engine=engine, engine_kwargs=engine_kwargs, **kwargs) - @Substitution(name="rolling", versionadded="") - @Appender(_shared_docs["std"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.std : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.std` is different + than the default ``ddof`` of 0 in :func:`numpy.std`. + + A minimum of one period is required for the rolling calculation. + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="standard deviation", + agg_method="std", + ) def std(self, ddof=1, *args, **kwargs): nv.validate_rolling_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name="rolling", versionadded="") - @Appender(_shared_docs["var"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.var : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.var` is different + than the default ``ddof`` of 0 in :func:`numpy.var`. + + A minimum of one period is required for the rolling calculation. 
+ """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="variance", + agg_method="var", + ) def var(self, ddof=1, *args, **kwargs): nv.validate_rolling_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name="rolling", func_name="skew") - @Appender(_doc_template) - @Appender(_shared_docs["skew"]) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.skew : Third moment of a probability density.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of three periods is required for the rolling calculation.\n", + window_method="rolling", + aggregation_description="unbiased skewness", + agg_method="skew", + ) def skew(self, **kwargs): return super().skew(**kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["sem"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + args_compat, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + "A minimum of one period is required for the calculation.\n", + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([0, 1, 2, 3]) + >>> s.rolling(2, min_periods=1).sem() + 0 NaN + 1 0.707107 + 2 0.707107 + 3 0.707107 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="standard error of mean", + agg_method="sem", + ) def sem(self, ddof=1, *args, **kwargs): return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) - _agg_doc = dedent( + @doc( + template_header, + create_section_header("Parameters"), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.kurtosis : Reference SciPy method.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of four periods is required for the calculation.\n", + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}") + -1.200000 + >>> print(f"{{scipy.stats.kurtosis(arr[1:], bias=False):.6f}}") + 3.999946 + >>> s = pd.Series(arr) + >>> s.rolling(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 3.999946 + dtype: float64 """ - Examples - -------- - - The example below will show a rolling calculation with a window size of - four matching the equivalent function call using `scipy.stats`. 
- - >>> arr = [1, 2, 3, 4, 999] - >>> import scipy.stats - >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") - -1.200000 - >>> print(f"{scipy.stats.kurtosis(arr[1:], bias=False):.6f}") - 3.999946 - >>> s = pd.Series(arr) - >>> s.rolling(4).kurt() - 0 NaN - 1 NaN - 2 NaN - 3 -1.200000 - 4 3.999946 - dtype: float64 - """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="Fisher's definition of kurtosis without bias", + agg_method="kurt", ) - - @Appender(_agg_doc) - @Substitution(name="rolling") - @Appender(_shared_docs["kurt"]) def kurt(self, **kwargs): return super().kurt(**kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["quantile"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + quantile : float + Quantile to compute. 0 <= quantile <= 1. + interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).quantile(.4, interpolation='lower') + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + dtype: float64 + + >>> s.rolling(2).quantile(.4, interpolation='midpoint') + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="quantile", + agg_method="quantile", + ) def quantile(self, quantile, interpolation="linear", **kwargs): return super().quantile( quantile=quantile, @@ -2255,14 +1982,154 @@ def quantile(self, quantile, interpolation="linear", **kwargs): **kwargs, ) - @Substitution(name="rolling", func_name="cov") - @Appender(_doc_template) - @Appender(_shared_docs["cov"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. 
+ """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also[:-1], + window_method="rolling", + aggregation_description="sample covariance", + agg_method="cov", + ) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["corr"]) + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + cov : Similar method to calculate covariance. + numpy.corrcoef : NumPy Pearson's correlation calculation. + """ + ).replace("\n", "", 1), + template_see_also, + create_section_header("Notes"), + dedent( + """ + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + set to `True`. + + Function will return ``NaN`` for correlations of equal valued sequences; + this is the result of a 0/0 division error. + + When `pairwise` is set to `False`, only matching columns between `self` and + `other` will be used. + + When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the `other` DataFrame + columns on the second level. + + In the case of missing elements, only complete pairwise observations + will be used. + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The below example shows a rolling calculation with a window size of + four matching the equivalent function call using :meth:`numpy.corrcoef`. + + >>> v1 = [3, 3, 3, 5, 8] + >>> v2 = [3, 4, 4, 4, 8] + >>> # numpy returns a 2X2 array, the correlation coefficient + >>> # is the number at entry [0][1] + >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}") + 0.333333 + >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}") + 0.916949 + >>> s1 = pd.Series(v1) + >>> s2 = pd.Series(v2) + >>> s1.rolling(4).corr(s2) + 0 NaN + 1 NaN + 2 NaN + 3 0.333333 + 4 0.916949 + dtype: float64 + + The below example shows a similar rolling calculation on a + DataFrame using the pairwise option. + + >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ + [46., 31.], [50., 36.]]) + >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) + [[1. 0.6263001] + [0.6263001 1. ]] + >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) + [[1. 0.5553681] + [0.5553681 1. 
]] + >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> df + X Y + 0 51.0 35.0 + 1 49.0 30.0 + 2 47.0 32.0 + 3 46.0 31.0 + 4 50.0 36.0 + >>> df.rolling(4).corr(pairwise=True) + X Y + 0 X NaN NaN + Y NaN NaN + 1 X NaN NaN + Y NaN NaN + 2 X NaN NaN + Y NaN NaN + 3 X 1.000000 0.626300 + Y 0.626300 1.000000 + 4 X 1.000000 0.555368 + Y 0.555368 1.000000 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="correlation", + agg_method="corr", + ) def corr(self, other=None, pairwise=None, ddof=1, **kwargs): return super().corr(other=other, pairwise=pairwise, ddof=ddof, **kwargs) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 213be7c05b370..84b5cae09acce 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1069,26 +1069,37 @@ def __init__( xlrd_version = LooseVersion(get_version(xlrd)) - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: - ext = inspect_excel_format( - content_or_path=path_or_buffer, storage_options=storage_options - ) - + ext = None if engine is None: + # Only determine ext if it is needed + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content_or_path=path_or_buffer, storage_options=storage_options + ) + # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": + if engine == "xlrd" and xlrd_version is not None: + if ext is None: + # Need ext to determine ext in order to raise/warn + if isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + path_or_buffer, storage_options=storage_options + ) + + if ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." 
) - else: + elif ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 1de4bf5730a56..6733a82906616 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,11 +1,12 @@ from __future__ import annotations +from distutils.version import LooseVersion from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -528,7 +529,16 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: # GH 39001 # Reading of excel file depends on dimension data being correct but # writers sometimes omit or get it wrong - sheet.reset_dimensions() + import openpyxl + + version = LooseVersion(get_version(openpyxl)) + + # There is no good way of determining if a sheet is read-only + # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 + is_readonly = hasattr(sheet, "reset_dimensions") + + if version >= "3.0.0" and is_readonly: + sheet.reset_dimensions() data: List[List[Scalar]] = [] last_row_with_data = -1 @@ -538,15 +548,17 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: last_row_with_data = row_number data.append(converted_row) - if len(data) > 0: - # Trim trailing empty rows - data = data[: last_row_with_data + 1] + # Trim trailing empty rows + data = data[: last_row_with_data + 1] + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: + empty_cell: List[Scalar] = [""] data = [ - data_row + (max_width - len(data_row)) * [""] for data_row in data + data_row + (max_width - len(data_row)) * empty_cell + for data_row in data ] return data diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0cb9aa3bea6ab..6eac9ba87c73d 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -42,7 +42,9 @@ from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") - +CSSSequence = Sequence[Tuple[str, Union[str, int, float]]] +CSSProperties = Union[str, CSSSequence] +CSSStyles = List[Dict[str, CSSProperties]] try: from matplotlib import colors @@ -147,7 +149,7 @@ def __init__( self, data: FrameOrSeriesUnion, precision: Optional[int] = None, - table_styles: Optional[List[Dict[str, List[Tuple[str, str]]]]] = None, + table_styles: Optional[CSSStyles] = None, uuid: Optional[str] = None, caption: Optional[str] = None, table_attributes: Optional[str] = None, @@ -267,7 +269,7 @@ def set_tooltips(self, ttips: DataFrame) -> Styler: def set_tooltips_class( self, name: Optional[str] = None, - properties: Optional[Sequence[Tuple[str, Union[str, int, float]]]] = None, + properties: Optional[CSSProperties] = None, ) -> Styler: """ Manually configure the name and/or properties of the class for @@ -279,8 +281,8 @@ def set_tooltips_class( ---------- name : str, default None Name of the tooltip class used in CSS, should conform to HTML standards. - properties : list-like, default None - List of (attr, value) tuples; see example. 
+ properties : list-like or str, default None + List of (attr, value) tuples or a valid CSS string; see example. Returns ------- @@ -311,6 +313,8 @@ def set_tooltips_class( ... ('visibility', 'hidden'), ... ('position', 'absolute'), ... ('z-index', 1)]) + >>> df.style.set_tooltips_class(name='tt-add', + ... properties='visibility:hidden; position:absolute; z-index:1;') """ self._init_tooltips() assert self.tooltips is not None # mypy requirement @@ -1118,7 +1122,12 @@ def set_caption(self, caption: str) -> Styler: self.caption = caption return self - def set_table_styles(self, table_styles, axis=0, overwrite=True) -> Styler: + def set_table_styles( + self, + table_styles: Union[Dict[Any, CSSStyles], CSSStyles], + axis: int = 0, + overwrite: bool = True, + ) -> Styler: """ Set the table styles on a Styler. @@ -1172,13 +1181,20 @@ def set_table_styles(self, table_styles, axis=0, overwrite=True) -> Styler: ... 'props': [('background-color', 'yellow')]}] ... ) + Or with CSS strings + + >>> df.style.set_table_styles( + ... [{'selector': 'tr:hover', + ... 'props': 'background-color: yellow; font-size: 1em;']}] + ... ) + Adding column styling by name >>> df.style.set_table_styles({ ... 'A': [{'selector': '', ... 'props': [('color', 'red')]}], ... 'B': [{'selector': 'td', - ... 'props': [('color', 'blue')]}] + ... 'props': 'color: blue;']}] ... }, overwrite=False) Adding row styling @@ -1188,7 +1204,7 @@ def set_table_styles(self, table_styles, axis=0, overwrite=True) -> Styler: ... 'props': [('font-size', '25px')]}] ... }, axis=1, overwrite=False) """ - if is_dict_like(table_styles): + if isinstance(table_styles, dict): if axis in [0, "index"]: obj, idf = self.data.columns, ".col" else: @@ -1196,12 +1212,20 @@ def set_table_styles(self, table_styles, axis=0, overwrite=True) -> Styler: table_styles = [ { - "selector": s["selector"] + idf + str(obj.get_loc(key)), - "props": s["props"], + "selector": str(s["selector"]) + idf + str(obj.get_loc(key)), + "props": _maybe_convert_css_to_tuples(s["props"]), } for key, styles in table_styles.items() for s in styles ] + else: + table_styles = [ + { + "selector": s["selector"], + "props": _maybe_convert_css_to_tuples(s["props"]), + } + for s in table_styles + ] if not overwrite and self.table_styles is not None: self.table_styles.extend(table_styles) @@ -1816,7 +1840,7 @@ class _Tooltips: def __init__( self, - css_props: Sequence[Tuple[str, Union[str, int, float]]] = [ + css_props: CSSProperties = [ ("visibility", "hidden"), ("position", "absolute"), ("z-index", 1), @@ -1830,7 +1854,7 @@ def __init__( self.class_name = css_name self.class_properties = css_props self.tt_data = tooltips - self.table_styles: List[Dict[str, Union[str, List[Tuple[str, str]]]]] = [] + self.table_styles: CSSStyles = [] @property def _class_styles(self): @@ -1843,7 +1867,12 @@ def _class_styles(self): ------- styles : List """ - return [{"selector": f".{self.class_name}", "props": self.class_properties}] + return [ + { + "selector": f".{self.class_name}", + "props": _maybe_convert_css_to_tuples(self.class_properties), + } + ] def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str): """ @@ -2025,3 +2054,25 @@ def _maybe_wrap_formatter( else: msg = f"Expected a string, got {na_rep} instead" raise TypeError(msg) + + +def _maybe_convert_css_to_tuples(style: CSSProperties) -> CSSSequence: + """ + Convert css-string to sequence of tuples format if needed. 
+ 'color:red; border:1px solid black;' -> [('color', 'red'), + ('border','1px solid red')] + """ + if isinstance(style, str): + s = style.split(";") + try: + return [ + (x.split(":")[0].strip(), x.split(":")[1].strip()) + for x in s + if x.strip() != "" + ] + except IndexError: + raise ValueError( + "Styles supplied as string must follow CSS rule formats, " + f"for example 'attr: val;'. {style} was given." + ) + return style diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 0d23addbb5f21..8961fd0a7af06 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -2,7 +2,19 @@ import csv import datetime import itertools -from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Union, cast +from typing import ( + Any, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Union, + cast, +) import warnings import numpy as np @@ -335,9 +347,7 @@ def _maybe_dedup_names(self, names): # would be nice! if self.mangle_dupe_cols: names = list(names) # so we can index - # pandas\io\parsers.py:1559: error: Need type annotation for - # 'counts' [var-annotated] - counts = defaultdict(int) # type: ignore[var-annotated] + counts: DefaultDict[Union[int, str, Tuple], int] = defaultdict(int) is_potential_mi = _is_potential_multi_index(names, self.index_col) for i, col in enumerate(names): @@ -382,9 +392,8 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): # add names for the index if indexnamerow: coffset = len(indexnamerow) - len(columns) - # pandas\io\parsers.py:1604: error: Item "None" of "Optional[Any]" - # has no attribute "set_names" [union-attr] - index = index.set_names(indexnamerow[:coffset]) # type: ignore[union-attr] + assert index is not None + index = index.set_names(indexnamerow[:coffset]) # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) @@ -458,9 +467,8 @@ def _agg_index(self, index, try_parse_dates=True) -> Index: col_na_fvalues = set() if isinstance(self.na_values, dict): - # pandas\io\parsers.py:1678: error: Value of type - # "Optional[Any]" is not indexable [index] - col_name = self.index_names[i] # type: ignore[index] + assert self.index_names is not None + col_name = self.index_names[i] if col_name is not None: col_na_values, col_na_fvalues = _get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na @@ -549,7 +557,7 @@ def _convert_to_ndarrays( return result def _set_noconvert_dtype_columns( - self, col_indices: List[int], names: List[Union[int, str]] + self, col_indices: List[int], names: List[Union[int, str, Tuple]] ) -> Set[int]: """ Set the columns that should not undergo dtype conversions. @@ -850,7 +858,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols): return [None] * len(index_col), columns, index_col cp_cols = list(columns) - index_names = [] + index_names: List[Optional[Union[int, str]]] = [] # don't mutate index_col = list(index_col) @@ -871,10 +879,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols): # Only clean index names that were placeholders. 
for i, name in enumerate(index_names): if isinstance(name, str) and name in unnamed_cols: - # pandas\io\parsers.py:3445: error: No overload variant of - # "__setitem__" of "list" matches argument types "int", "None" - # [call-overload] - index_names[i] = None # type: ignore[call-overload] + index_names[i] = None return index_names, columns, index_col diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 9bd3bc9fb5c62..d1d77c5e044be 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -131,12 +131,8 @@ def __init__(self, src: FilePathOrBuffer, **kwds): self.index_names = index_names if self._reader.header is None and not passed_names: - # pandas\io\parsers.py:1997: error: Argument 1 to "len" has - # incompatible type "Optional[Any]"; expected "Sized" - # [arg-type] - self.index_names = [None] * len( - self.index_names # type: ignore[arg-type] - ) + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) self._implicit_index = self._reader.leading_cols > 0 diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c005f69e3c04e..223acdea80ca6 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -3,7 +3,7 @@ from io import StringIO import re import sys -from typing import Iterator, List, Optional, Set, cast +from typing import DefaultDict, Iterator, List, Optional, Set, Tuple, cast import numpy as np @@ -118,7 +118,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self.columns = self.columns[0] # get popped off for index - self.orig_names = list(self.columns) + self.orig_names: List[Union[int, str, Tuple]] = list(self.columns) # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory @@ -236,10 +236,7 @@ def read(self, rows=None): # done with first read, next time raise StopIteration self._first_chunk = False - # pandas\io\parsers.py:2480: error: Argument 1 to "list" has - # incompatible type "Optional[Any]"; expected "Iterable[Any]" - # [arg-type] - columns = list(self.orig_names) # type: ignore[arg-type] + columns = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) @@ -292,15 +289,8 @@ def _clean_mapping(mapping): """converts col numbers to names""" clean = {} for col, v in mapping.items(): - # pandas\io\parsers.py:2537: error: Unsupported right operand - # type for in ("Optional[Any]") [operator] - if ( - isinstance(col, int) - and col not in self.orig_names # type: ignore[operator] - ): - # pandas\io\parsers.py:2538: error: Value of type - # "Optional[Any]" is not indexable [index] - col = self.orig_names[col] # type: ignore[index] + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] clean[col] = v return clean @@ -320,15 +310,8 @@ def _clean_mapping(mapping): na_value = self.na_values[col] na_fvalue = self.na_fvalues[col] - # pandas\io\parsers.py:2558: error: Unsupported right operand - # type for in ("Optional[Any]") [operator] - if ( - isinstance(col, int) - and col not in self.orig_names # type: ignore[operator] - ): - # pandas\io\parsers.py:2559: error: Value of type - # "Optional[Any]" is not indexable [index] - col = self.orig_names[col] # type: ignore[index] + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] clean_na_values[col] = na_value 
clean_na_fvalues[col] = na_fvalue @@ -349,10 +332,7 @@ def _infer_columns(self): names = self.names num_original_columns = 0 clear_buffer = True - # pandas\io\parsers.py:2580: error: Need type annotation for - # 'unnamed_cols' (hint: "unnamed_cols: Set[] = ...") - # [var-annotated] - unnamed_cols = set() # type: ignore[var-annotated] + unnamed_cols: Set[Optional[Union[int, str]]] = set() if self.header is not None: header = self.header @@ -366,9 +346,7 @@ def _infer_columns(self): have_mi_columns = False header = [header] - # pandas\io\parsers.py:2594: error: Need type annotation for - # 'columns' (hint: "columns: List[] = ...") [var-annotated] - columns = [] # type: ignore[var-annotated] + columns: List[List[Optional[Union[int, str]]]] = [] for level, hr in enumerate(header): try: line = self._buffered_line() @@ -397,7 +375,7 @@ def _infer_columns(self): line = self.names[:] - this_columns = [] + this_columns: List[Optional[Union[int, str]]] = [] this_unnamed_cols = [] for i, c in enumerate(line): @@ -413,9 +391,7 @@ def _infer_columns(self): this_columns.append(c) if not have_mi_columns and self.mangle_dupe_cols: - # pandas\io\parsers.py:2639: error: Need type annotation - # for 'counts' [var-annotated] - counts = defaultdict(int) # type: ignore[var-annotated] + counts: DefaultDict = defaultdict(int) for i, col in enumerate(this_columns): cur_count = counts[col] @@ -439,16 +415,10 @@ def _infer_columns(self): if lc != unnamed_count and lc - ic > unnamed_count: clear_buffer = False - # pandas\io\parsers.py:2663: error: List item 0 has - # incompatible type "None"; expected "str" - # [list-item] - this_columns = [None] * lc # type: ignore[list-item] + this_columns = [None] * lc self.buf = [self.buf[-1]] - # pandas\io\parsers.py:2666: error: Argument 1 to "append" of - # "list" has incompatible type "List[str]"; expected - # "List[None]" [arg-type] - columns.append(this_columns) # type: ignore[arg-type] + columns.append(this_columns) unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) if len(columns) == 1: @@ -490,19 +460,9 @@ def _infer_columns(self): if not names: if self.prefix: - # pandas\io\parsers.py:2711: error: List comprehension has - # incompatible type List[str]; expected List[None] [misc] - columns = [ - [ - f"{self.prefix}{i}" # type: ignore[misc] - for i in range(ncols) - ] - ] + columns = [[f"{self.prefix}{i}" for i in range(ncols)]] else: - # pandas\io\parsers.py:2713: error: Argument 1 to "list" - # has incompatible type "range"; expected "Iterable[None]" - # [arg-type] - columns = [list(range(ncols))] # type: ignore[arg-type] + columns = [list(range(ncols))] columns = self._handle_usecols(columns, columns[0]) else: if self.usecols is None or len(names) >= num_original_columns: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 37814a5debf2e..dc45336bb4c0f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1,12 +1,11 @@ """ Module contains tools for processing files into DataFrames or other objects """ - from collections import abc import csv import sys from textwrap import fill -from typing import Any, Dict, Optional, Set, Type +from typing import Any, Dict, List, Optional, Set, Type import warnings import numpy as np @@ -270,6 +269,11 @@ Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python standard encodings `_ . + .. versionchanged:: 1.2 + + When ``encoding`` is ``None``, ``errors="replace"`` is passed to + ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. 
+ This behavior was previously only the case for ``engine="python"``. dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the following parameters: `delimiter`, `doublequote`, `escapechar`, @@ -279,7 +283,7 @@ error_bad_lines : bool, default True Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. - If False, then these "bad lines" will dropped from the DataFrame that is + If False, then these "bad lines" will be dropped from the DataFrame that is returned. warn_bad_lines : bool, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each @@ -722,6 +726,7 @@ def _get_options_with_defaults(self, engine): kwds = self.orig_options options = {} + default: Optional[object] for argname, default in parser_defaults.items(): value = kwds.get(argname, default) @@ -751,10 +756,7 @@ def _get_options_with_defaults(self, engine): options[argname] = value if engine == "python-fwf": - # pandas\io\parsers.py:907: error: Incompatible types in assignment - # (expression has type "object", variable has type "Union[int, str, - # None]") [assignment] - for argname, default in _fwf_defaults.items(): # type: ignore[assignment] + for argname, default in _fwf_defaults.items(): options[argname] = kwds.get(argname, default) return options @@ -1048,15 +1050,13 @@ def TextParser(*args, **kwds): def _clean_na_values(na_values, keep_default_na=True): - + na_fvalues: Union[Set, Dict] if na_values is None: if keep_default_na: na_values = STR_NA_VALUES else: na_values = set() - # pandas\io\parsers.py:3387: error: Need type annotation for - # 'na_fvalues' (hint: "na_fvalues: Set[] = ...") [var-annotated] - na_fvalues = set() # type: ignore[var-annotated] + na_fvalues = set() elif isinstance(na_values, dict): old_na_values = na_values.copy() na_values = {} # Prevent aliasing. 
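For context on the hunk above: `_clean_na_values` normalizes the `na_values` argument of `read_csv`/`read_table` into a set (or a per-column dict of sets) plus the float-valued `na_fvalues` counterpart, which is what the new `na_fvalues: Union[Set, Dict]` annotation covers. A minimal sketch of the per-column dict form this helper handles, assuming hypothetical CSV content and column names that are not part of this patch:

    from io import StringIO

    import pandas as pd

    # Hypothetical data: "missing" is an extra NA sentinel for column "a" only;
    # "n/a" is listed for column "b" (and is also one of the default sentinels).
    csv = StringIO("a,b\n1,x\nmissing,y\n3,n/a\n")
    df = pd.read_csv(csv, na_values={"a": ["missing"], "b": ["n/a"]})
    print(df.dtypes)  # column "a" is float64 because of the NaN it now contains
    print(df)
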
@@ -1073,12 +1073,7 @@ def _clean_na_values(na_values, keep_default_na=True): v = set(v) | STR_NA_VALUES na_values[k] = v - # pandas\io\parsers.py:3404: error: Incompatible types in assignment - # (expression has type "Dict[Any, Any]", variable has type "Set[Any]") - # [assignment] - na_fvalues = { # type: ignore[assignment] - k: _floatify_na_values(v) for k, v in na_values.items() - } + na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} else: if not is_list_like(na_values): na_values = [na_values] @@ -1106,7 +1101,7 @@ def _floatify_na_values(na_values): def _stringify_na_values(na_values): """ return a stringified and numeric for these values """ - result = [] + result: List[Union[int, str, float]] = [] for x in na_values: result.append(str(x)) result.append(x) @@ -1119,15 +1114,11 @@ def _stringify_na_values(na_values): result.append(f"{v}.0") result.append(str(v)) - # pandas\io\parsers.py:3522: error: Argument 1 to "append" of - # "list" has incompatible type "float"; expected "str" [arg-type] - result.append(v) # type: ignore[arg-type] + result.append(v) except (TypeError, ValueError, OverflowError): pass try: - # pandas\io\parsers.py:3526: error: Argument 1 to "append" of - # "list" has incompatible type "int"; expected "str" [arg-type] - result.append(int(x)) # type: ignore[arg-type] + result.append(int(x)) except (TypeError, ValueError, OverflowError): pass return set(result) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 9a5c9e4a2e2b2..8f8c435fae4f3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -98,6 +98,19 @@ Return StataReader object for iterations, returns chunks with given number of lines.""" +_compression_params = f"""\ +compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {{'infer', + 'gzip', 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is one of + {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above, + other entries passed as additional compression options. 
+{generic._shared_docs["storage_options"]}""" + + _iterator_params = """\ iterator : bool, default False Return StataReader object.""" @@ -129,6 +142,7 @@ {_statafile_processing_params2} {_chunksize_params} {_iterator_params} +{_compression_params} Returns ------- @@ -180,6 +194,7 @@ {_statafile_processing_params1} {_statafile_processing_params2} {_chunksize_params} +{_compression_params} {_reader_notes} """ @@ -1026,6 +1041,7 @@ def __init__( columns: Optional[Sequence[str]] = None, order_categoricals: bool = True, chunksize: Optional[int] = None, + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): super().__init__() @@ -1064,10 +1080,10 @@ def __init__( "rb", storage_options=storage_options, is_text=False, + compression=compression, ) as handles: # Copy to BytesIO, and ensure no encoding - contents = handles.handle.read() - self.path_or_buf = BytesIO(contents) # type: ignore[arg-type] + self.path_or_buf = BytesIO(handles.handle.read()) # type: ignore[arg-type] self._read_header() self._setup_dtype() @@ -1898,6 +1914,7 @@ def read_stata( order_categoricals: bool = True, chunksize: Optional[int] = None, iterator: bool = False, + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> Union[DataFrame, StataReader]: @@ -1912,6 +1929,7 @@ def read_stata( order_categoricals=order_categoricals, chunksize=chunksize, storage_options=storage_options, + compression=compression, ) if iterator or chunksize: diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 31423c03dee34..71804bded3e44 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -1,7 +1,6 @@ import pandas._testing as tm from pandas.api import types - -from .test_api import Base +from pandas.tests.api.test_api import Base class TestTypes(Base): diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 577b8dec1181d..4a2e8ba8219aa 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -14,8 +14,7 @@ import pandas._testing as tm from pandas.core import ops from pandas.core.arrays import TimedeltaArray - -from .common import assert_invalid_comparison +from pandas.tests.arithmetic.common import assert_invalid_comparison # ------------------------------------------------------------------ # Comparisons diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index c4afe971d533e..740ec3be4a1c6 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -31,14 +31,23 @@ def assert_dtype(obj, expected_dtype): """ Helper to check the dtype for a Series, Index, or single-column DataFrame. """ - if isinstance(obj, DataFrame): - dtype = obj.dtypes.iat[0] - else: - dtype = obj.dtype + dtype = tm.get_dtype(obj) assert dtype == expected_dtype +def get_expected_name(box, names): + if box is DataFrame: + # Since we are operating with a DataFrame and a non-DataFrame, + # the non-DataFrame is cast to Series and its name ignored. 
+ exname = names[0] + elif box in [tm.to_array, pd.array]: + exname = names[1] + else: + exname = names[2] + return exname + + # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons @@ -1212,19 +1221,12 @@ def test_td64arr_add_sub_tdi(self, box_with_array, names): # GH#17250 make sure result dtype is correct # GH#19043 make sure names are propagated correctly box = box_with_array + exname = get_expected_name(box, names) - if box is pd.DataFrame and names[1] != names[0]: - pytest.skip( - "Name propagation for DataFrame does not behave like " - "it does for Index/Series" - ) - - tdi = TimedeltaIndex(["0 days", "1 day"], name=names[0]) + tdi = TimedeltaIndex(["0 days", "1 day"], name=names[1]) tdi = np.array(tdi) if box in [tm.to_array, pd.array] else tdi - ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) - expected = Series( - [Timedelta(hours=3), Timedelta(days=1, hours=4)], name=names[2] - ) + ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[0]) + expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], name=exname) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1238,7 +1240,7 @@ def test_td64arr_add_sub_tdi(self, box_with_array, names): assert_dtype(result, "timedelta64[ns]") expected = Series( - [Timedelta(hours=-3), Timedelta(days=1, hours=-4)], name=names[2] + [Timedelta(hours=-3), Timedelta(days=1, hours=-4)], name=exname ) expected = tm.box_expected(expected, box) @@ -1318,19 +1320,14 @@ def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): def test_td64arr_add_offset_index(self, names, box_with_array): # GH#18849, GH#19744 box = box_with_array - - if box is pd.DataFrame and names[1] != names[0]: - pytest.skip( - "Name propagation for DataFrame does not behave like " - "it does for Index/Series" - ) + exname = get_expected_name(box, names) tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) other = np.array(other) if box in [tm.to_array, pd.array] else other expected = TimedeltaIndex( - [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=names[2] + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=exname ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1370,13 +1367,7 @@ def test_td64arr_sub_offset_index(self, names, box_with_array): # GH#18824, GH#19744 box = box_with_array xbox = box if box not in [tm.to_array, pd.array] else pd.Index - exname = names[2] if box not in [tm.to_array, pd.array] else names[1] - - if box is pd.DataFrame and names[1] != names[0]: - pytest.skip( - "Name propagation for DataFrame does not behave like " - "it does for Index/Series" - ) + exname = get_expected_name(box, names) tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) @@ -1412,15 +1403,7 @@ def test_td64arr_with_offset_series(self, names, box_with_array): # GH#18849 box = box_with_array box2 = Series if box in [pd.Index, tm.to_array, pd.array] else box - - if box is pd.DataFrame: - # Since we are operating with a DataFrame and a non-DataFrame, - # the non-DataFrame is cast to Series and its name ignored. 
- exname = names[0] - elif box in [tm.to_array, pd.array]: - exname = names[1] - else: - exname = names[2] + exname = get_expected_name(box, names) tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) @@ -2100,11 +2083,7 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype) def test_td64arr_mul_int_series(self, box_with_array, names, request): # GH#19042 test for correct name attachment box = box_with_array - if box_with_array is pd.DataFrame and names[2] is None: - reason = "broadcasts along wrong axis, but doesn't raise" - request.node.add_marker(pytest.mark.xfail(reason=reason)) - - exname = names[2] if box not in [tm.to_array, pd.array] else names[1] + exname = get_expected_name(box, names) tdi = TimedeltaIndex( ["0days", "1day", "2days", "3days", "4days"], name=names[0] @@ -2119,11 +2098,8 @@ def test_td64arr_mul_int_series(self, box_with_array, names, request): ) tdi = tm.box_expected(tdi, box) - xbox = ( - Series - if (box is pd.Index or box is tm.to_array or box is pd.array) - else box - ) + xbox = get_upcast_box(box, ser) + expected = tm.box_expected(expected, xbox) result = ser * tdi @@ -2154,9 +2130,7 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): name=xname, ) - xbox = box - if box in [pd.Index, tm.to_array, pd.array] and type(ser) is Series: - xbox = Series + xbox = get_upcast_box(box, ser) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, xbox) diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 9e164a250cdb1..6ba3347796e08 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -15,15 +15,8 @@ async def test_tab_complete_warning(self, ip): code = "import pandas as pd; c = Categorical([])" await ip.run_code(code) - # GH 31324 newer jedi version raises Deprecation warning - import jedi - - if jedi.__version__ < "0.16.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("c.", 1)) diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 8cd0d29a34ec8..58fedbd3e4231 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -14,8 +14,8 @@ ("float", np.nan), ("bool", False), ("object", np.nan), - ("datetime64[ns]", pd.NaT), - ("timedelta64[ns]", pd.NaT), + ("datetime64[ns]", np.datetime64("NaT", "ns")), + ("timedelta64[ns]", np.timedelta64("NaT", "ns")), ], ) def test_inferred_dtype(dtype, fill_value): diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 86c4b4c5ce63d..d159d76030250 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -172,7 +172,7 @@ def test_value_counts_preserves_tz(self): assert result.index.equals(dti) arr[-2] = pd.NaT - result = arr.value_counts() + result = arr.value_counts(dropna=False) expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 
53d53e35c6eb5..a749955d35494 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -3,83 +3,10 @@ import pandas.core.dtypes.concat as _concat import pandas as pd -from pandas import DatetimeIndex, Period, PeriodIndex, Series, TimedeltaIndex +from pandas import Series import pandas._testing as tm -@pytest.mark.parametrize( - "to_concat, expected", - [ - # int/float/str - ([["a"], [1, 2]], ["i", "object"]), - ([[3, 4], [1, 2]], ["i"]), - ([[3, 4], [1, 2.1]], ["i", "f"]), - # datetimelike - ([DatetimeIndex(["2011-01-01"]), DatetimeIndex(["2011-01-02"])], ["datetime"]), - ([TimedeltaIndex(["1 days"]), TimedeltaIndex(["2 days"])], ["timedelta"]), - # datetimelike object - ( - [ - DatetimeIndex(["2011-01-01"]), - DatetimeIndex(["2011-01-02"], tz="US/Eastern"), - ], - ["datetime", "datetime64[ns, US/Eastern]"], - ), - ( - [ - DatetimeIndex(["2011-01-01"], tz="Asia/Tokyo"), - DatetimeIndex(["2011-01-02"], tz="US/Eastern"), - ], - ["datetime64[ns, Asia/Tokyo]", "datetime64[ns, US/Eastern]"], - ), - ([TimedeltaIndex(["1 days"]), TimedeltaIndex(["2 hours"])], ["timedelta"]), - ( - [ - DatetimeIndex(["2011-01-01"], tz="Asia/Tokyo"), - TimedeltaIndex(["1 days"]), - ], - ["datetime64[ns, Asia/Tokyo]", "timedelta"], - ), - ], -) -def test_get_dtype_kinds(index_or_series, to_concat, expected): - to_concat_klass = [index_or_series(c) for c in to_concat] - result = _concat._get_dtype_kinds(to_concat_klass) - assert result == set(expected) - - -@pytest.mark.parametrize( - "to_concat, expected", - [ - ( - [PeriodIndex(["2011-01"], freq="M"), PeriodIndex(["2011-01"], freq="M")], - ["period[M]"], - ), - ( - [ - Series([Period("2011-01", freq="M")]), - Series([Period("2011-02", freq="M")]), - ], - ["period[M]"], - ), - ( - [PeriodIndex(["2011-01"], freq="M"), PeriodIndex(["2011-01"], freq="D")], - ["period[M]", "period[D]"], - ), - ( - [ - Series([Period("2011-01", freq="M")]), - Series([Period("2011-02", freq="D")]), - ], - ["period[M]", "period[D]"], - ), - ], -) -def test_get_dtype_kinds_period(to_concat, expected): - result = _concat._get_dtype_kinds(to_concat) - assert result == set(expected) - - def test_concat_mismatched_categoricals_with_empty(): # concat_compat behavior on series._values should match pd.concat on series ser1 = Series(["a", "b", "c"], dtype="category") diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index c02185dd82043..0d92ef02e07c8 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -472,8 +472,8 @@ def test_array_equivalent_nested(): "dtype, na_value", [ # Datetime-like - (np.dtype("M8[ns]"), NaT), - (np.dtype("m8[ns]"), NaT), + (np.dtype("M8[ns]"), np.datetime64("NaT", "ns")), + (np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")), (DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]"), NaT), (PeriodDtype("M"), NaT), # Integer @@ -499,7 +499,11 @@ def test_array_equivalent_nested(): ) def test_na_value_for_dtype(dtype, na_value): result = na_value_for_dtype(dtype) - assert result is na_value + # identify check doesnt work for datetime64/timedelta64("NaT") bc they + # are not singletons + assert result is na_value or ( + isna(result) and isna(na_value) and type(result) is type(na_value) + ) class TestNAObj: diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 603216a0b5bbb..829be279b45d3 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -8,7 +8,10 @@ 
pytest.importorskip("pyarrow", minversion="0.13.0") -from .arrays import ArrowBoolArray, ArrowBoolDtype # isort:skip +from pandas.tests.extension.arrow.arrays import ( # isort:skip + ArrowBoolArray, + ArrowBoolDtype, +) @pytest.fixture diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index abd5c1f386dc5..23a07b2031bf5 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -4,7 +4,7 @@ pytest.importorskip("pyarrow", minversion="0.13.0") -from .arrays import ArrowStringDtype # isort:skip +from pandas.tests.extension.arrow.arrays import ArrowStringDtype # isort:skip def test_constructor_from_list(): diff --git a/pandas/tests/extension/arrow/test_timestamp.py b/pandas/tests/extension/arrow/test_timestamp.py index bd661ad20bb02..10e560b34a21c 100644 --- a/pandas/tests/extension/arrow/test_timestamp.py +++ b/pandas/tests/extension/arrow/test_timestamp.py @@ -12,7 +12,7 @@ import pyarrow as pa # isort:skip -from .arrays import ArrowExtensionArray # isort:skip +from pandas.tests.extension.arrow.arrays import ArrowExtensionArray # isort:skip @register_extension_dtype diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 323cb843b2d74..9cf3bdab40d0b 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -41,26 +41,26 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. """ -from .casting import BaseCastingTests # noqa -from .constructors import BaseConstructorsTests # noqa -from .dtype import BaseDtypeTests # noqa -from .getitem import BaseGetitemTests # noqa -from .groupby import BaseGroupbyTests # noqa -from .interface import BaseInterfaceTests # noqa -from .io import BaseParsingTests # noqa -from .methods import BaseMethodsTests # noqa -from .missing import BaseMissingTests # noqa -from .ops import ( # noqa +from pandas.tests.extension.base.casting import BaseCastingTests # noqa +from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa +from pandas.tests.extension.base.dtype import BaseDtypeTests # noqa +from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa +from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa +from pandas.tests.extension.base.interface import BaseInterfaceTests # noqa +from pandas.tests.extension.base.io import BaseParsingTests # noqa +from pandas.tests.extension.base.methods import BaseMethodsTests # noqa +from pandas.tests.extension.base.missing import BaseMissingTests # noqa +from pandas.tests.extension.base.ops import ( # noqa BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil, BaseUnaryOpsTests, ) -from .printing import BasePrintingTests # noqa -from .reduce import ( # noqa +from pandas.tests.extension.base.printing import BasePrintingTests # noqa +from pandas.tests.extension.base.reduce import ( # noqa BaseBooleanReduceTests, BaseNoReduceTests, BaseNumericReduceTests, ) -from .reshaping import BaseReshapingTests # noqa -from .setitem import BaseSetitemTests # noqa +from pandas.tests.extension.base.reshaping import BaseReshapingTests # noqa +from pandas.tests.extension.base.setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 039b42210224e..0b79a5368a542 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -3,8 +3,7 @@ import pandas as pd 
from pandas.core.internals import ObjectBlock - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseCastingTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 9dbfd2a5589c0..6f0d8d16a0224 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -3,8 +3,7 @@ import pandas as pd from pandas.core.internals import ExtensionBlock - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseConstructorsTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 128e0a9f81e91..5e4d23e91925a 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -5,8 +5,7 @@ import pandas as pd from pandas.api.types import is_object_dtype, is_string_dtype - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseDtypeTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index bfd6da0fc864d..286ed9c736f31 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -2,8 +2,7 @@ import pytest import pandas as pd - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseGetitemTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index c81304695f353..30b115b9dba6f 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -2,8 +2,7 @@ import pandas as pd import pandas._testing as tm - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseGroupbyTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6a4ff68b4580f..05a28f20b956a 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -5,8 +5,7 @@ import pandas as pd import pandas._testing as tm - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseInterfaceTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 3de752a8c682a..a8c25db3181d0 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -4,8 +4,7 @@ import pytest import pandas as pd - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseParsingTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 5906221389b35..bf5e9fe009cd1 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -1,3 +1,4 @@ +import inspect import operator import numpy as np @@ -8,13 +9,20 @@ import pandas as pd import pandas._testing as tm from pandas.core.sorting import nargsort - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseMethodsTests(BaseExtensionTests): """Various Series and DataFrame methods.""" + def test_value_counts_default_dropna(self, data): + # make sure we have consistent default dropna 
kwarg + if not hasattr(data, "value_counts"): + pytest.skip("value_counts is not implemented") + sig = inspect.signature(data.value_counts) + kwarg = sig.parameters["dropna"] + assert kwarg.default is True + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index a5969ef961bab..0cf03533915f2 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -2,8 +2,7 @@ import pandas as pd import pandas._testing as tm - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseMissingTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 5e00d7530d413..bae8e9df72d41 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,8 +5,7 @@ import pandas as pd import pandas._testing as tm from pandas.core import ops - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseOpsUtil(BaseExtensionTests): diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index ad34a83c7cf71..eab75be66080f 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -3,8 +3,7 @@ import pytest import pandas as pd - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BasePrintingTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 55f8aca1b8ae0..0f7bd59411eb5 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -4,8 +4,7 @@ import pandas as pd import pandas._testing as tm - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseReduceTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 44e3fc1eb56d8..18f6084f989dc 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -5,8 +5,7 @@ import pandas as pd from pandas.core.internals import ExtensionBlock - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseReshapingTests(BaseExtensionTests): diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 9ec842d801919..16b9b8e8efdea 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -3,8 +3,7 @@ import pandas as pd import pandas._testing as tm - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseSetitemTests(BaseExtensionTests): diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 8194327f8812e..34727b43a7b0f 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -1,3 +1,8 @@ -from .array import DecimalArray, DecimalDtype, make_data, to_decimal +from pandas.tests.extension.decimal.array import ( + DecimalArray, + DecimalDtype, + make_data, + to_decimal, +) __all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] diff --git a/pandas/tests/extension/decimal/array.py 
b/pandas/tests/extension/decimal/array.py index 4122fcaae496b..c7976c5800173 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -230,7 +230,7 @@ def convert_values(param): return np.asarray(res, dtype=bool) - def value_counts(self, dropna: bool = False): + def value_counts(self, dropna: bool = True): from pandas.core.algorithms import value_counts return value_counts(self.to_numpy(), dropna=dropna) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 63980b628b8d2..16278ec1ccc53 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -8,8 +8,12 @@ import pandas as pd import pandas._testing as tm from pandas.tests.extension import base - -from .array import DecimalArray, DecimalDtype, make_data, to_decimal +from pandas.tests.extension.decimal.array import ( + DecimalArray, + DecimalDtype, + make_data, + to_decimal, +) @pytest.fixture diff --git a/pandas/tests/extension/json/__init__.py b/pandas/tests/extension/json/__init__.py index e205c7ee50974..b6402b6c09526 100644 --- a/pandas/tests/extension/json/__init__.py +++ b/pandas/tests/extension/json/__init__.py @@ -1,3 +1,3 @@ -from .array import JSONArray, JSONDtype, make_data +from pandas.tests.extension.json.array import JSONArray, JSONDtype, make_data __all__ = ["JSONArray", "JSONDtype", "make_data"] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 30b2ee390bf1a..90a39f3b33e95 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -6,8 +6,7 @@ import pandas as pd import pandas._testing as tm from pandas.tests.extension import base - -from .array import JSONArray, JSONDtype, make_data +from pandas.tests.extension.json.array import JSONArray, JSONDtype, make_data @pytest.fixture diff --git a/pandas/tests/extension/list/__init__.py b/pandas/tests/extension/list/__init__.py index 108f1937d07d3..1cd85657e0de4 100644 --- a/pandas/tests/extension/list/__init__.py +++ b/pandas/tests/extension/list/__init__.py @@ -1,3 +1,3 @@ -from .array import ListArray, ListDtype, make_data +from pandas.tests.extension.list.array import ListArray, ListDtype, make_data __all__ = ["ListArray", "ListDtype", "make_data"] diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py index c5c4417155562..832bdf5bea3cf 100644 --- a/pandas/tests/extension/list/test_list.py +++ b/pandas/tests/extension/list/test_list.py @@ -1,8 +1,7 @@ import pytest import pandas as pd - -from .array import ListArray, ListDtype, make_data +from pandas.tests.extension.list.array import ListArray, ListDtype, make_data @pytest.fixture diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 9cea274a118c0..10e82a8c9bff1 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -117,9 +117,7 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - @pytest.mark.xfail(reason="Deliberately upcast to object?") - def test_concat_with_reindex(self, data): - super().test_concat_with_reindex(data) + pass class TestGetitem(base.BaseGetitemTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 07b574af2ef62..67bc9f3f58daa 100644 --- a/pandas/tests/extension/test_numpy.py +++ 
b/pandas/tests/extension/test_numpy.py @@ -21,8 +21,7 @@ import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray - -from . import base +from pandas.tests.extension import base @pytest.fixture(params=["float", "object"]) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 3b1a8ebcb13d0..6808ffe65e561 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1732,34 +1732,6 @@ def test_setitem(self, uint64_frame): ) -@pytest.mark.parametrize( - "src_idx", - [ - Index([]), - pd.CategoricalIndex([]), - ], -) -@pytest.mark.parametrize( - "cat_idx", - [ - # No duplicates - Index([]), - pd.CategoricalIndex([]), - Index(["A", "B"]), - pd.CategoricalIndex(["A", "B"]), - # Duplicates: GH#38906 - Index(["A", "A"]), - pd.CategoricalIndex(["A", "A"]), - ], -) -def test_reindex_empty(src_idx, cat_idx): - df = DataFrame(columns=src_idx, index=["K"], dtype="f8") - - result = df.reindex(columns=cat_idx) - expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") - tm.assert_frame_equal(result, expected) - - def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack df = DataFrame( diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index df98c78e78fb6..46f5a20f38941 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -651,3 +651,8 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request) # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) assert np.all(alt.iloc[1:] == result.iloc[1:]) + + def test_astype_bytes(self): + # GH#39474 + result = DataFrame(["foo", "bar", "baz"]).astype(bytes) + assert result.dtypes[0] == np.dtype("S3") diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index c49375758345c..9116b1ff5ad65 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -151,7 +151,7 @@ def test_reindex_methods_nearest_special(self): def test_reindex_nearest_tz(self, tz_aware_fixture): # GH26683 tz = tz_aware_fixture - idx = pd.date_range("2019-01-01", periods=5, tz=tz) + idx = date_range("2019-01-01", periods=5, tz=tz) df = DataFrame({"x": list(range(5))}, index=idx) expected = df.head(3) @@ -759,7 +759,7 @@ def test_reindex_multi(self): def test_reindex_multi_categorical_time(self): # https://github.com/pandas-dev/pandas/issues/21390 - midx = pd.MultiIndex.from_product( + midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), Categorical(date_range("2012-01-01", periods=3, freq="H")), @@ -906,3 +906,30 @@ def test_reindex_empty_frame(self, kwargs): result = df.reindex(idx, **kwargs) expected = DataFrame({"a": [pd.NA] * 3}, index=idx) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "src_idx", + [ + Index([]), + CategoricalIndex([]), + ], + ) + @pytest.mark.parametrize( + "cat_idx", + [ + # No duplicates + Index([]), + CategoricalIndex([]), + Index(["A", "B"]), + CategoricalIndex(["A", "B"]), + # Duplicates: GH#38906 + Index(["A", "A"]), + CategoricalIndex(["A", "A"]), + ], + ) + def test_reindex_empty(self, src_idx, cat_idx): + df = DataFrame(columns=src_idx, index=["K"], dtype="f8") + + result = df.reindex(columns=cat_idx) + expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") + 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 86c9e2f5ffe52..053684ba08484 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -579,6 +579,14 @@ def test_sort_values_item_cache(self, using_array_manager): assert df.iloc[0, 0] == df["A"][0] + def test_sort_values_reshaping(self): + # GH 39426 + values = list(range(21)) + expected = DataFrame([values], columns=values) + df = expected.sort_values(expected.index[0], axis=1, ignore_index=True) + + tm.assert_frame_equal(df, expected) + class TestDataFrameSortKey: # test key sorting (issue 27237) def test_sort_values_inplace_key(self, sort_by_key): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a7e2fa760b7e4..29a2d9c17202e 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -274,17 +274,9 @@ async def test_tab_complete_warning(self, ip, frame_or_series): await ip.run_code(code) - # TODO: remove it when Ipython updates - # GH 33567, jedi version raises Deprecation warning in Ipython - import jedi - - if jedi.__version__ < "0.17.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("obj.", 1)) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index e8ae9f6584ad6..81e10d276e79c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1931,3 +1931,25 @@ def test_unstack_with_level_has_nan(self): ) tm.assert_index_equal(result, expected) + + def test_stack_nan_in_multiindex_columns(self): + # GH#39481 + df = DataFrame( + np.zeros([1, 5]), + columns=MultiIndex.from_tuples( + [ + (0, None, None), + (0, 2, 0), + (0, 2, 1), + (0, 3, 0), + (0, 3, 1), + ], + ), + ) + result = df.stack(2) + expected = DataFrame( + [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], + index=Index([(0, None), (0, 0), (0, 1)]), + columns=Index([(0, None), (0, 2), (0, 3)]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 757f71730819d..194b8bdd4715e 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -7,8 +7,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range import pandas._testing as tm - -from .test_generic import Generic +from pandas.tests.generic.test_generic import Generic class TestDataFrame(Generic): diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 474661e0f2e0a..38ab8d333e880 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -6,8 +6,7 @@ import pandas as pd from pandas import MultiIndex, Series, date_range import pandas._testing as tm - -from .test_generic import Generic +from pandas.tests.generic.test_generic import Generic class TestSeries(Generic): diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 2119600887ba4..75b3a6ece0b21 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ 
b/pandas/tests/indexes/categorical/test_category.py @@ -7,8 +7,7 @@ from pandas import Categorical import pandas._testing as tm from pandas.core.indexes.api import CategoricalIndex, Index - -from ..common import Base +from pandas.tests.indexes.common import Base class TestCategoricalIndex(Base): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 96fc85fcf4ae6..c70401ac14e7d 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -32,7 +32,6 @@ class Base: """ base class for index sub-class tests """ _holder: Type[Index] - _compat_props = ["shape", "ndim", "size", "nbytes"] def create_index(self) -> Index: raise NotImplementedError("Method not implemented") @@ -191,29 +190,6 @@ def test_logical_compat(self): with pytest.raises(TypeError, match="cannot perform any"): idx.any() - def test_reindex_base(self): - idx = self.create_index() - expected = np.arange(idx.size, dtype=np.intp) - - actual = idx.get_indexer(idx) - tm.assert_numpy_array_equal(expected, actual) - - with pytest.raises(ValueError, match="Invalid fill method"): - idx.get_indexer(idx, method="invalid") - - def test_ndarray_compat_properties(self): - idx = self.create_index() - assert idx.T.equals(idx) - assert idx.transpose().equals(idx) - - values = idx.values - for prop in self._compat_props: - assert getattr(idx, prop) == getattr(values, prop) - - # test for validity - idx.nbytes - idx.values.nbytes - def test_repr_roundtrip(self): idx = self.create_index() @@ -654,7 +630,7 @@ def test_map(self): def test_map_dictlike(self, mapper): index = self.create_index() - if isinstance(index, (pd.CategoricalIndex, pd.IntervalIndex)): + if isinstance(index, pd.CategoricalIndex): pytest.skip(f"skipping tests for {type(index)}") identity = mapper(index.values, index) @@ -681,21 +657,6 @@ def test_map_str(self): expected = Index([str(x) for x in index], dtype=object) tm.assert_index_equal(result, expected) - def test_putmask_with_wrong_mask(self): - # GH18368 - index = self.create_index() - fill = index[0] - - msg = "putmask: mask and data must be the same size" - with pytest.raises(ValueError, match=msg): - index.putmask(np.ones(len(index) + 1, np.bool_), fill) - - with pytest.raises(ValueError, match=msg): - index.putmask(np.ones(len(index) - 1, np.bool_), fill) - - with pytest.raises(ValueError, match=msg): - index.putmask("foo", fill) - @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize("name", [None, "foo"]) @pytest.mark.parametrize("ordered", [True, False]) @@ -760,25 +721,6 @@ def test_getitem_2d_deprecated(self): assert isinstance(res, np.ndarray), type(res) - def test_contains_requires_hashable_raises(self): - idx = self.create_index() - - msg = "unhashable type: 'list'" - with pytest.raises(TypeError, match=msg): - [] in idx - - msg = "|".join( - [ - r"unhashable type: 'dict'", - r"must be real number, not dict", - r"an integer is required", - r"\{\}", - r"pandas\._libs\.interval\.IntervalTree' is not iterable", - ] - ) - with pytest.raises(TypeError, match=msg): - {} in idx._engine - def test_copy_shares_cache(self): # GH32898, GH36840 idx = self.create_index() diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index c128f4ab6b7dd..41f8e3408d191 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -5,8 +5,7 @@ import pandas as pd import pandas._testing as tm - -from .common import Base +from pandas.tests.indexes.common import Base class DatetimeLike(Base): diff --git 
a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py
index a5abf2946feda..0360b33a4a519 100644
--- a/pandas/tests/indexes/datetimes/test_datetimelike.py
+++ b/pandas/tests/indexes/datetimes/test_datetimelike.py
@@ -3,8 +3,7 @@
 
 from pandas import DatetimeIndex, date_range
 import pandas._testing as tm
-
-from ..datetimelike import DatetimeLike
+from pandas.tests.indexes.datetimelike import DatetimeLike
 
 
 class TestDatetimeIndex(DatetimeLike):
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
index a24c8e252d234..090e21be254e3 100644
--- a/pandas/tests/indexes/datetimes/test_ops.py
+++ b/pandas/tests/indexes/datetimes/test_ops.py
@@ -316,16 +316,13 @@ def test_nat(self, tz_naive_fixture):
         idx = DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
         assert idx._can_hold_na
 
-        tm.assert_numpy_array_equal(idx._isnan, np.array([False, False]))
         assert idx.hasnans is False
-        tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp))
 
         idx = DatetimeIndex(["2011-01-01", "NaT"], tz=tz)
         assert idx._can_hold_na
 
         tm.assert_numpy_array_equal(idx._isnan, np.array([False, True]))
         assert idx.hasnans is True
-        tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp))
 
     @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []])
     @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)])
diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py
index a77af84ee1ed0..ce477485bb21e 100644
--- a/pandas/tests/indexes/multi/conftest.py
+++ b/pandas/tests/indexes/multi/conftest.py
@@ -5,6 +5,7 @@
 from pandas import Index, MultiIndex
 
 
+# Note: identical to the "multi" entry in the top-level "index" fixture
 @pytest.fixture
 def idx():
     # a MultiIndex used to test the general functionality of the
@@ -49,12 +50,6 @@ def index_names():
     return ["first", "second"]
 
 
-@pytest.fixture
-def compat_props():
-    # a MultiIndex must have these properties associated with it
-    return ["shape", "ndim", "size"]
-
-
 @pytest.fixture
 def narrow_multi_index():
     """
diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py
index 72b5ed0edaa78..d2b5a595b8454 100644
--- a/pandas/tests/indexes/multi/test_compat.py
+++ b/pandas/tests/indexes/multi/test_compat.py
@@ -35,32 +35,6 @@ def test_logical_compat(idx, method):
         getattr(idx, method)()
 
 
-def test_boolean_context_compat(idx):
-
-    msg = (
-        "The truth value of a MultiIndex is ambiguous. "
-        r"Use a.empty, a.bool\(\), a.item\(\), a.any\(\) or a.all\(\)."
-    )
-    with pytest.raises(ValueError, match=msg):
-        bool(idx)
-
-
-def test_boolean_context_compat2():
-
-    # boolean context compat
-    # GH7897
-    i1 = MultiIndex.from_tuples([("A", 1), ("A", 2)])
-    i2 = MultiIndex.from_tuples([("A", 1), ("A", 3)])
-    common = i1.intersection(i2)
-
-    msg = (
-        r"The truth value of a MultiIndex is ambiguous\. "
-        r"Use a\.empty, a\.bool\(\), a\.item\(\), a\.any\(\) or a\.all\(\)\."
- ) - with pytest.raises(ValueError, match=msg): - bool(common) - - def test_inplace_mutation_resets_values(): levels = [["a", "b", "c"], [4]] levels2 = [[1, 2, 3], ["a"]] @@ -124,19 +98,6 @@ def test_inplace_mutation_resets_values(): assert "_values" in mi2._cache -def test_ndarray_compat_properties(idx, compat_props): - assert idx.T.equals(idx) - assert idx.transpose().equals(idx) - - values = idx.values - for prop in compat_props: - assert getattr(idx, prop) == getattr(values, prop) - - # test for validity - idx.nbytes - idx.values.nbytes - - def test_pickle_compat_construction(): # this is testing for pickle compat # need an object to create with diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 33f927bdd7c04..5f2f8f75045bb 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -1,7 +1,15 @@ import numpy as np import pytest -from pandas import Float64Index, Index, Int64Index, RangeIndex, Series, UInt64Index +from pandas import ( + Float64Index, + Index, + Int64Index, + RangeIndex, + Series, + Timestamp, + UInt64Index, +) import pandas._testing as tm @@ -102,13 +110,10 @@ def test_get_loc_na(self): idx = Float64Index([np.nan, 1, np.nan]) assert idx.get_loc(1) == 1 - # FIXME: dont leave commented-out # representable by slice [0:2:2] - # pytest.raises(KeyError, idx.slice_locs, np.nan) - sliced = idx.slice_locs(np.nan) - assert isinstance(sliced, tuple) - assert sliced == (0, 3) - + msg = "'Cannot get left slice bound for non-unique label: nan'" + with pytest.raises(KeyError, match=msg): + idx.slice_locs(np.nan) # not representable by slice idx = Float64Index([np.nan, 1, np.nan, np.nan]) assert idx.get_loc(1) == 1 @@ -128,6 +133,14 @@ def test_get_loc_missing_nan(self): # listlike/non-hashable raises TypeError idx.get_loc([np.nan]) + @pytest.mark.parametrize("vals", [[1], [1.0], [Timestamp("2019-12-31")], ["test"]]) + @pytest.mark.parametrize("method", ["nearest", "pad", "backfill"]) + def test_get_loc_float_index_nan_with_method(self, vals, method): + # GH#39382 + idx = Index(vals) + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan, method=method) + class TestGetIndexer: def test_get_indexer(self): diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 645019f1ac063..fd0a77bf7930b 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -278,14 +278,12 @@ def test_nat(self): tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) idx = PeriodIndex(["2011-01-01", "NaT"], freq="D") assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) def test_freq_setter_deprecated(self): # GH 20678 diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index b0de16a25bcc3..fd4b34a1b32a9 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -2,7 +2,6 @@ import pytest from pandas._libs.tslibs.period import IncompatibleFrequency -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -18,8 +17,7 @@ period_range, ) import pandas._testing as tm - -from ..datetimelike import DatetimeLike +from 
pandas.tests.indexes.datetimelike import DatetimeLike class TestPeriodIndex(DatetimeLike): @@ -329,10 +327,6 @@ def test_shift(self): # This is tested in test_arithmetic pass - @td.skip_if_32bit - def test_ndarray_compat_properties(self): - super().test_ndarray_compat_properties() - def test_negative_ordinals(self): Period(ordinal=-1000, freq="A") Period(ordinal=0, freq="A") diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 40cd812ebe368..57df2a1e83418 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -6,8 +6,7 @@ import pandas as pd from pandas import Float64Index, Index, Int64Index, RangeIndex import pandas._testing as tm - -from ..test_numeric import Numeric +from pandas.tests.indexes.test_numeric import Numeric # aliases to make some tests easier to read RI = RangeIndex @@ -18,7 +17,6 @@ class TestRangeIndex(Numeric): _holder = RangeIndex - _compat_props = ["shape", "ndim", "size"] @pytest.fixture( params=[ diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index c8629fdf1e3a6..c9c86f9eebde9 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -11,10 +11,14 @@ def test_boolean_context_compat(index): + # GH#7897 with pytest.raises(ValueError, match="The truth value of a"): if index: pass + with pytest.raises(ValueError, match="The truth value of a"): + bool(index) + def test_sort(index): msg = "cannot sort an Index object in-place, use sort_values instead" @@ -27,6 +31,12 @@ def test_hash_error(index): hash(index) +def test_copy_dtype_deprecated(index): + # GH#35853 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + index.copy(dtype=object) + + def test_mutability(index): if not len(index): return diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 092b1c447eb0d..5fd1a15416e23 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -878,14 +878,11 @@ def test_difference_name_preservation(self, index, second_name, expected, sort): assert result.name == expected def test_difference_empty_arg(self, index, sort): - if isinstance(index, MultiIndex): - pytest.skip("Not applicable") first = index[5:20] first.name = "name" result = first.difference([], sort) - assert tm.equalContents(result, first) - assert result.name == first.name + tm.assert_index_equal(result, first) @pytest.mark.parametrize("index", ["string"], indirect=True) def test_difference_identity(self, index, sort): @@ -1598,16 +1595,9 @@ async def test_tab_complete_warning(self, ip): code = "import pandas as pd; idx = Index([1, 2])" await ip.run_code(code) - # GH 31324 newer jedi version raises Deprecation warning - import jedi - - if jedi.__version__ < "0.16.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("idx.", 4)) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 2b49ea00d3322..d622ea359bc53 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -9,6 +9,7 @@ import pytest from pandas._libs.tslibs import iNaT +from pandas.compat import IS64 from 
pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion @@ -51,11 +52,9 @@ def test_droplevel(self, index): ): index.droplevel(level) - def test_constructor_non_hashable_name(self, index): + def test_constructor_non_hashable_name(self, index_flat): # GH 20527 - - if isinstance(index, MultiIndex): - pytest.skip("multiindex handled in test_multi.py") + index = index_flat message = "Index.name must be a hashable type" renamed = [["1"]] @@ -68,27 +67,23 @@ def test_constructor_non_hashable_name(self, index): with pytest.raises(TypeError, match=message): index.set_names(names=renamed) - def test_constructor_unwraps_index(self, index): - if isinstance(index, pd.MultiIndex): - raise pytest.skip("MultiIndex has no ._data") - a = index + def test_constructor_unwraps_index(self, index_flat): + a = index_flat b = type(a)(a) tm.assert_equal(a._data, b._data) - def test_to_flat_index(self, index): + def test_to_flat_index(self, index_flat): # 22866 - if isinstance(index, MultiIndex): - pytest.skip("Separate expectation for MultiIndex") + index = index_flat result = index.to_flat_index() tm.assert_index_equal(result, index) - def test_set_name_methods(self, index): + def test_set_name_methods(self, index_flat): + # MultiIndex tested separately + index = index_flat new_name = "This is the new name for this index" - # don't tests a MultiIndex here (as its tested separated) - if isinstance(index, MultiIndex): - pytest.skip("Skip check for MultiIndex") original_name = index.name new_ind = index.set_names([new_name]) assert new_ind.name == new_name @@ -112,11 +107,10 @@ def test_set_name_methods(self, index): assert index.name == name assert index.names == [name] - def test_copy_and_deepcopy(self, index): + def test_copy_and_deepcopy(self, index_flat): from copy import copy, deepcopy - if isinstance(index, MultiIndex): - pytest.skip("Skip check for MultiIndex") + index = index_flat for func in (copy, deepcopy): idx_copy = func(index) @@ -126,10 +120,9 @@ def test_copy_and_deepcopy(self, index): new_copy = index.copy(deep=True, name="banana") assert new_copy.name == "banana" - def test_unique(self, index): + def test_unique(self, index_flat): # don't test a MultiIndex here (as its tested separated) - if isinstance(index, MultiIndex): - pytest.skip("Skip check for MultiIndex") + index = index_flat # GH 17896 expected = index.drop_duplicates() @@ -148,9 +141,10 @@ def test_unique(self, index): with pytest.raises(KeyError, match=msg): index.unique(level="wrong") - def test_get_unique_index(self, index): + def test_get_unique_index(self, index_flat): # MultiIndex tested separately - if not len(index) or isinstance(index, MultiIndex): + index = index_flat + if not len(index): pytest.skip("Skip check for empty Index and MultiIndex") idx = index[[0] * 5] @@ -199,11 +193,12 @@ def test_get_unique_index(self, index): result = i._get_unique_index(dropna=dropna) tm.assert_index_equal(result, expected) - def test_searchsorted_monotonic(self, index): + def test_searchsorted_monotonic(self, index_flat): # GH17271 + index = index_flat # not implemented for tuple searches in MultiIndex # or Intervals searches in IntervalIndex - if isinstance(index, (MultiIndex, pd.IntervalIndex)): + if isinstance(index, pd.IntervalIndex): pytest.skip("Skip check for MultiIndex/IntervalIndex") # nothing to test if the index is empty @@ -244,9 +239,9 @@ def test_searchsorted_monotonic(self, index): with pytest.raises(ValueError, match=msg): index._searchsorted_monotonic(value, side="left") - def test_drop_duplicates(self, index, 
keep): - if isinstance(index, MultiIndex): - pytest.skip("MultiIndex is tested separately") + def test_drop_duplicates(self, index_flat, keep): + # MultiIndex is tested separately + index = index_flat if isinstance(index, RangeIndex): pytest.skip( "RangeIndex is tested in test_drop_duplicates_no_duplicates " @@ -278,9 +273,9 @@ def test_drop_duplicates(self, index, keep): expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep)) tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped) - def test_drop_duplicates_no_duplicates(self, index): - if isinstance(index, MultiIndex): - pytest.skip("MultiIndex is tested separately") + def test_drop_duplicates_no_duplicates(self, index_flat): + # MultiIndex is tested separately + index = index_flat # make unique index if isinstance(index, RangeIndex): @@ -304,9 +299,12 @@ def test_drop_duplicates_inplace(self, index): with pytest.raises(TypeError, match=msg): index.drop_duplicates(inplace=True) - def test_has_duplicates(self, index): + def test_has_duplicates(self, index_flat): + # MultiIndex tested separately in: + # tests/indexes/multi/test_unique_and_duplicates. + index = index_flat holder = type(index) - if not len(index) or isinstance(index, (MultiIndex, RangeIndex)): + if not len(index) or isinstance(index, RangeIndex): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates. # RangeIndex is unique by definition. @@ -362,29 +360,18 @@ def test_asi8_deprecation(self, index): @pytest.mark.parametrize("na_position", [None, "middle"]) -def test_sort_values_invalid_na_position(request, index_with_missing, na_position): - if isinstance(index_with_missing, MultiIndex): - request.node.add_marker( - pytest.mark.xfail( - reason="missing value sorting order not defined for index type" - ) - ) +def test_sort_values_invalid_na_position(index_with_missing, na_position): - if na_position not in ["first", "last"]: - with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"): - index_with_missing.sort_values(na_position=na_position) + with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"): + index_with_missing.sort_values(na_position=na_position) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_sort_values_with_missing(request, index_with_missing, na_position): +def test_sort_values_with_missing(index_with_missing, na_position): # GH 35584. 
Test that sort_values works with missing values, # sort non-missing and place missing according to na_position - if isinstance(index_with_missing, MultiIndex): - request.node.add_marker( - pytest.mark.xfail(reason="missing value sorting order not implemented") - ) - elif isinstance(index_with_missing, CategoricalIndex): + if isinstance(index_with_missing, CategoricalIndex): pytest.skip("missing value sorting order not well-defined") missing_count = np.sum(index_with_missing.isna()) @@ -398,3 +385,25 @@ def test_sort_values_with_missing(request, index_with_missing, na_position): result = index_with_missing.sort_values(na_position=na_position) tm.assert_index_equal(result, expected) + + +def test_ndarray_compat_properties(index): + if isinstance(index, PeriodIndex) and not IS64: + pytest.skip("Overflow") + idx = index + assert idx.T.equals(idx) + assert idx.transpose().equals(idx) + + values = idx.values + + assert idx.shape == values.shape + assert idx.ndim == values.ndim + assert idx.size == values.size + + if not isinstance(index, (RangeIndex, MultiIndex)): + # These two are not backed by an ndarray + assert idx.nbytes == values.nbytes + + # test for validity + idx.nbytes + idx.values.nbytes diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 04f7a65bd5c56..8d2637e4a06f6 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -24,6 +24,7 @@ Index, Int64Index, IntervalIndex, + MultiIndex, PeriodIndex, Series, TimedeltaIndex, @@ -140,6 +141,26 @@ def test_contains_with_float_index(self): assert 1.0 not in float_index assert 1 not in float_index + def test_contains_requires_hashable_raises(self, index): + if isinstance(index, MultiIndex): + return # TODO: do we want this to raise? 
+ + msg = "unhashable type: 'list'" + with pytest.raises(TypeError, match=msg): + [] in index + + msg = "|".join( + [ + r"unhashable type: 'dict'", + r"must be real number, not dict", + r"an integer is required", + r"\{\}", + r"pandas\._libs\.interval\.IntervalTree' is not iterable", + ] + ) + with pytest.raises(TypeError, match=msg): + {} in index._engine + class TestGetValue: @pytest.mark.parametrize( @@ -161,19 +182,30 @@ def test_get_value(self, index): class TestGetIndexer: + def test_get_indexer_base(self, index): + + if index._index_as_unique: + expected = np.arange(index.size, dtype=np.intp) + actual = index.get_indexer(index) + tm.assert_numpy_array_equal(expected, actual) + else: + msg = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=msg): + index.get_indexer(index) + + with pytest.raises(ValueError, match="Invalid fill method"): + index.get_indexer(index, method="invalid") + def test_get_indexer_consistency(self, index): # See GH#16819 - if isinstance(index, IntervalIndex): - # requires index.is_non_overlapping - return - if index.is_unique: + if index._index_as_unique: indexer = index.get_indexer(index[0:2]) assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp else: - e = "Reindexing only valid with uniquely valued Index objects" - with pytest.raises(InvalidIndexError, match=e): + msg = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=msg): index.get_indexer(index[0:2]) indexer, _ = index.get_indexer_non_unique(index[0:2]) @@ -197,6 +229,25 @@ def test_convert_almost_null_slice(self, index): index._convert_slice_indexer(key, "loc") +class TestPutmask: + def test_putmask_with_wrong_mask(self, index): + # GH#18368 + if not len(index): + return + + fill = index[0] + + msg = "putmask: mask and data must be the same size" + with pytest.raises(ValueError, match=msg): + index.putmask(np.ones(len(index) + 1, np.bool_), fill) + + with pytest.raises(ValueError, match=msg): + index.putmask(np.ones(len(index) - 1, np.bool_), fill) + + with pytest.raises(ValueError, match=msg): + index.putmask("foo", fill) + + @pytest.mark.parametrize( "idx", [Index([1, 2, 3]), Index([0.1, 0.2, 0.3]), Index(["a", "b", "c"])] ) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e391b76ddbd15..d6b92999305b2 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -204,12 +204,17 @@ def test_constructor_invalid(self): ) with pytest.raises(TypeError, match=msg): Float64Index(0.0) - msg = ( - "String dtype not supported, " - "you may need to explicitly cast to a numeric type" + + # 2021-02-1 we get ValueError in numpy 1.20, but not on all builds + msg = "|".join( + [ + "String dtype not supported, you may need to explicitly cast ", + "could not convert string to float: 'a'", + ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, ValueError), match=msg): Float64Index(["a", "b", 0.0]) + msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" with pytest.raises(TypeError, match=msg): Float64Index([Timestamp("20130101")]) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index f2a33df71e8e3..746b6d6fb6e2a 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -38,33 +38,39 @@ def test_union_same_types(index): assert idx1.union(idx2).dtype == idx1.dtype -def 
test_union_different_types(request, index, index_fixture2): +def test_union_different_types(index_flat, index_flat2): # This test only considers combinations of indices # GH 23525 - idx1, idx2 = index, index_fixture2 - type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) - if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: - request.node.add_marker( - pytest.mark.xfail(reason="This test only considers non compatible indexes.") - ) - - if any(isinstance(idx, pd.MultiIndex) for idx in (idx1, idx2)): - pytest.skip("This test doesn't consider multiindixes.") + idx1 = index_flat + idx2 = index_flat2 - if is_dtype_equal(idx1.dtype, idx2.dtype): - pytest.skip("This test only considers non matching dtypes.") - - # A union with a CategoricalIndex (even as dtype('O')) and a - # non-CategoricalIndex can only be made if both indices are monotonic. - # This is true before this PR as well. + type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index idx1 = idx1.sort_values() idx2 = idx2.sort_values() - assert idx1.union(idx2).dtype == np.dtype("O") - assert idx2.union(idx1).dtype == np.dtype("O") + res1 = idx1.union(idx2) + res2 = idx2.union(idx1) + + if is_dtype_equal(idx1.dtype, idx2.dtype): + assert res1.dtype == idx1.dtype + assert res2.dtype == idx1.dtype + + elif type_pair not in COMPATIBLE_INCONSISTENT_PAIRS: + # A union with a CategoricalIndex (even as dtype('O')) and a + # non-CategoricalIndex can only be made if both indices are monotonic. + # This is true before this PR as well. + assert res1.dtype == np.dtype("O") + assert res2.dtype == np.dtype("O") + + elif idx1.dtype.kind in ["f", "i", "u"] and idx2.dtype.kind in ["f", "i", "u"]: + assert res1.dtype == np.dtype("f8") + assert res2.dtype == np.dtype("f8") + + else: + raise NotImplementedError @pytest.mark.parametrize("idx_fact1,idx_fact2", COMPATIBLE_INCONSISTENT_PAIRS.values()) @@ -275,12 +281,12 @@ def test_symmetric_difference(self, index): (None, None, None), ], ) - def test_corner_union(self, index, fname, sname, expected_name): + def test_corner_union(self, index_flat, fname, sname, expected_name): # GH#9943, GH#9862 # Test unions with various name combinations # Do not test MultiIndex or repeats - - if isinstance(index, MultiIndex) or not index.is_unique: + index = index_flat + if not index.is_unique: pytest.skip("Not for MultiIndex or repeated indices") # Test copy.union(copy) @@ -321,8 +327,9 @@ def test_corner_union(self, index, fname, sname, expected_name): (None, None, None), ], ) - def test_union_unequal(self, index, fname, sname, expected_name): - if isinstance(index, MultiIndex) or not index.is_unique: + def test_union_unequal(self, index_flat, fname, sname, expected_name): + index = index_flat + if not index.is_unique: pytest.skip("Not for MultiIndex or repeated indices") # test copy.union(subset) - need sort for unicode and string @@ -342,11 +349,11 @@ def test_union_unequal(self, index, fname, sname, expected_name): (None, None, None), ], ) - def test_corner_intersect(self, index, fname, sname, expected_name): + def test_corner_intersect(self, index_flat, fname, sname, expected_name): # GH#35847 # Test intersections with various name combinations - - if isinstance(index, MultiIndex) or not index.is_unique: + index = index_flat + if not index.is_unique: pytest.skip("Not for MultiIndex or repeated indices") # Test copy.intersection(copy) @@ -387,8 +394,9 @@ def test_corner_intersect(self, index, 
fname, sname, expected_name): (None, None, None), ], ) - def test_intersect_unequal(self, index, fname, sname, expected_name): - if isinstance(index, MultiIndex) or not index.is_unique: + def test_intersect_unequal(self, index_flat, fname, sname, expected_name): + index = index_flat + if not index.is_unique: pytest.skip("Not for MultiIndex or repeated indices") # test copy.intersection(subset) - need sort for unicode and string diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 3578174e17141..83b8fcc1b15fe 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -217,14 +217,12 @@ def test_nat(self): tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) idx = TimedeltaIndex(["1 days", "NaT"]) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index a86cd8dd11c59..d16a32247b917 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -14,8 +14,7 @@ timedelta_range, ) import pandas._testing as tm - -from ..datetimelike import DatetimeLike +from pandas.tests.indexes.datetimelike import DatetimeLike randn = np.random.randn diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 6c0d1c285acf3..d0ef95d2fa56c 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -229,6 +229,85 @@ def test_frame_getitem_nan_multiindex(nulls_fixture): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "indexer,expected", + [ + ( + (["b"], ["bar", np.nan]), + ( + DataFrame( + [[2, 3], [5, 6]], + columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]), + dtype="int64", + ) + ), + ), + ( + (["a", "b"]), + ( + DataFrame( + [[1, 2, 3], [4, 5, 6]], + columns=MultiIndex.from_tuples( + [("a", "foo"), ("b", "bar"), ("b", np.nan)] + ), + dtype="int64", + ) + ), + ), + ( + (["b"]), + ( + DataFrame( + [[2, 3], [5, 6]], + columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]), + dtype="int64", + ) + ), + ), + ( + (["b"], ["bar"]), + ( + DataFrame( + [[2], [5]], + columns=MultiIndex.from_tuples([("b", "bar")]), + dtype="int64", + ) + ), + ), + ( + (["b"], [np.nan]), + ( + DataFrame( + [[3], [6]], + columns=MultiIndex( + codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]] + ), + dtype="int64", + ) + ), + ), + (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))), + ], +) +def test_frame_getitem_nan_cols_multiindex( + indexer, + expected, + nulls_fixture, +): + # Slicing MultiIndex including levels with nan values, for more information + # see GH#25154 + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], + columns=MultiIndex.from_tuples( + [("a", "foo"), ("b", "bar"), ("b", nulls_fixture)] + ), + dtype="int64", + ) + + result = df.loc[:, indexer] + tm.assert_equal(result, expected) + + # ---------------------------------------------------------------------------- # test indexing of DataFrame with 
multi-level Index with duplicates # ---------------------------------------------------------------------------- diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index b60135a802b8e..6c7d5f06ac355 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -780,6 +780,42 @@ def test_non_reducing_slice_on_multiindex(self): expected = DataFrame({("b", "d"): [4, 1]}) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "slice_", + [ + pd.IndexSlice[:, :], + # check cols + pd.IndexSlice[:, pd.IndexSlice[["a"]]], # inferred deeper need list + pd.IndexSlice[:, pd.IndexSlice[["a"], ["c"]]], # inferred deeper need list + pd.IndexSlice[:, pd.IndexSlice["a", "c", :]], + pd.IndexSlice[:, pd.IndexSlice["a", :, "e"]], + pd.IndexSlice[:, pd.IndexSlice[:, "c", "e"]], + pd.IndexSlice[:, pd.IndexSlice["a", ["c", "d"], :]], # check list + pd.IndexSlice[:, pd.IndexSlice["a", ["c", "d", "-"], :]], # allow missing + pd.IndexSlice[:, pd.IndexSlice["a", ["c", "d", "-"], "e"]], # no slice + # check rows + pd.IndexSlice[pd.IndexSlice[["U"]], :], # inferred deeper need list + pd.IndexSlice[pd.IndexSlice[["U"], ["W"]], :], # inferred deeper need list + pd.IndexSlice[pd.IndexSlice["U", "W", :], :], + pd.IndexSlice[pd.IndexSlice["U", :, "Y"], :], + pd.IndexSlice[pd.IndexSlice[:, "W", "Y"], :], + pd.IndexSlice[pd.IndexSlice[:, "W", ["Y", "Z"]], :], # check list + pd.IndexSlice[pd.IndexSlice[:, "W", ["Y", "Z", "-"]], :], # allow missing + pd.IndexSlice[pd.IndexSlice["U", "W", ["Y", "Z", "-"]], :], # no slice + # check simultaneous + pd.IndexSlice[pd.IndexSlice[:, "W", "Y"], pd.IndexSlice["a", "c", :]], + ], + ) + def test_non_reducing_multi_slice_on_multiindex(self, slice_): + # GH 33562 + cols = pd.MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]) + idxs = pd.MultiIndex.from_product([["U", "V"], ["W", "X"], ["Y", "Z"]]) + df = DataFrame(np.arange(64).reshape(8, 8), columns=cols, index=idxs) + + expected = df.loc[slice_] + result = df.loc[non_reducing_slice(slice_)] + tm.assert_frame_equal(result, expected) + def test_loc_slice_negative_stepsize(self): # GH#38071 mi = MultiIndex.from_product([["a", "b"], [0, 1]]) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 6c8b1622e76aa..69b9e63d7e215 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -161,34 +161,19 @@ def test_setitem_series_complex128(self, val, exp_dtype): @pytest.mark.parametrize( "val,exp_dtype", [ - (1, np.int64), - (3, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), + (1, object), + ("3", object), + (3, object), + (1.1, object), + (1 + 1j, object), (True, np.bool_), ], ) - def test_setitem_series_bool(self, val, exp_dtype, request): + def test_setitem_series_bool(self, val, exp_dtype): obj = pd.Series([True, False, True, False]) assert obj.dtype == np.bool_ - mark = None - if exp_dtype is np.int64: - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, val, exp, np.bool_) - mark = pytest.mark.xfail(reason="TODO_GH12747 The result must be int") - elif exp_dtype is np.float64: - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, val, exp, np.bool_) - mark = pytest.mark.xfail(reason="TODO_GH12747 The result must be float") - elif exp_dtype is np.complex128: - exp = pd.Series([True, True, True, False]) - 
self._assert_setitem_series_conversion(obj, val, exp, np.bool_) - mark = pytest.mark.xfail(reason="TODO_GH12747 The result must be complex") - if mark is not None: - request.node.add_marker(mark) - - exp = pd.Series([True, val, True, False]) + exp = pd.Series([True, val, True, False], dtype=exp_dtype) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d490a23317fef..686f383deab37 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -14,8 +14,7 @@ import pandas._testing as tm from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice from pandas.tests.indexing.common import _mklbl - -from .test_floats import gen_obj +from pandas.tests.indexing.test_floats import gen_obj # ------------------------------------------------------------------------ # Indexing test cases diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index c99d9ae62bf54..ddc3c42710a61 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -36,11 +36,3 @@ def test_read_writer_table(): result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0) tm.assert_frame_equal(result, expected) - - -def test_nonexistent_sheetname_raises(read_ext): - # GH-27676 - # Specifying a non-existent sheet_name parameter should throw an error - # with the sheet name. - with pytest.raises(ValueError, match="Worksheet named 'xyz' not found"): - pd.read_excel("blank.ods", sheet_name="xyz") diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 98e45a7f18f96..613a32d478c91 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,6 +1,10 @@ +from distutils.version import LooseVersion + import numpy as np import pytest +from pandas.compat._optional import get_version + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -135,6 +139,10 @@ def test_to_excel_with_openpyxl_engine(ext): @pytest.mark.parametrize( "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) +@pytest.mark.xfail( + LooseVersion(get_version(openpyxl)) < "3.0.0", + reason="openpyxl read-only sheet is incorrect when dimension data is wrong", +) def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}") @@ -143,6 +151,51 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename) tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +@pytest.mark.parametrize("read_only", [True, False]) +@pytest.mark.xfail( + LooseVersion(get_version(openpyxl)) < "3.0.0", + reason="openpyxl read-only sheet is incorrect when dimension data is wrong", +) +def test_read_wb_with_bad_dimension( + datapath, ext, filename, header, expected_data, read_only +): + # GH 38956, 39001 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") + wb = 
openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): + # GH 39528 + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = pd.read_excel(filename) + tm.assert_frame_equal(result, expected) + + def test_read_with_empty_trailing_rows(datapath, ext): # GH 39181 path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") @@ -155,3 +208,17 @@ def test_read_with_empty_trailing_rows(datapath, ext): } ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_empty_with_blank_row(datapath, ext, read_only): + # GH 39547 - empty excel file with a row that has no data + path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b2e87de5580e6..a594718bd62d9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,6 +2,7 @@ from functools import partial import os from urllib.error import URLError +from zipfile import BadZipFile import numpy as np import pytest @@ -685,7 +686,13 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(ValueError, match="File is not a recognized excel file"): + if engine is None or engine == "xlrd": + error = ValueError + msg = "File is not a recognized excel file" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): pd.read_excel(bad_stream) @tm.network diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 29055837a0721..c0d8acf8ab562 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -40,15 +40,6 @@ def test_read_xlrd_book(read_ext, frame): tm.assert_frame_equal(df, result) -# TODO: test for openpyxl as well -def test_excel_table_sheet_by_index(datapath, read_ext): - path = datapath("io", "data", "excel", f"test1{read_ext}") - msg = "Worksheet named 'invalid_sheet_name' not found" - with ExcelFile(path, engine="xlrd") as excel: - with pytest.raises(ValueError, match=msg): - pd.read_excel(excel, sheet_name="invalid_sheet_name") - - def test_excel_file_warning_with_xlsx_file(datapath): # GH 29375 path = datapath("io", "data", "excel", "test1.xlsx") diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 6556075272308..d2c5b5b9d0b2c 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -12,7 +12,11 @@ import pandas._testing as tm jinja2 = pytest.importorskip("jinja2") -from pandas.io.formats.style import Styler, _get_level_lengths # isort:skip +from pandas.io.formats.style import ( # isort:skip + Styler, + _get_level_lengths, + _maybe_convert_css_to_tuples, +) class TestStyler: @@ -377,29 +381,26 @@ def f(x): } assert result 
== expected - def test_applymap_subset_multiindex(self): + @pytest.mark.parametrize( + "slice_", + [ + pd.IndexSlice[:, pd.IndexSlice["x", "A"]], + pd.IndexSlice[:, pd.IndexSlice[:, "A"]], + pd.IndexSlice[:, pd.IndexSlice[:, ["A", "C"]]], # missing col element + pd.IndexSlice[pd.IndexSlice["a", 1], :], + pd.IndexSlice[pd.IndexSlice[:, 1], :], + pd.IndexSlice[pd.IndexSlice[:, [1, 3]], :], # missing row element + pd.IndexSlice[:, ("x", "A")], + pd.IndexSlice[("a", 1), :], + ], + ) + def test_applymap_subset_multiindex(self, slice_): # GH 19861 - # Smoke test for applymap - def color_negative_red(val): - """ - Takes a scalar and returns a string with - the css property `'color: red'` for negative - strings, black otherwise. - """ - color = "red" if val < 0 else "black" - return f"color: {color}" - - dic = { - ("a", "d"): [-1.12, 2.11], - ("a", "c"): [2.78, -2.88], - ("b", "c"): [-3.99, 3.77], - ("b", "d"): [4.21, -1.22], - } - - idx = pd.IndexSlice - df = DataFrame(dic, index=[0, 1]) - - (df.style.applymap(color_negative_red, subset=idx[:, idx["b", "d"]]).render()) + # edited for GH 33562 + idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]]) + col = pd.MultiIndex.from_product([["x", "y"], ["A", "B"]]) + df = DataFrame(np.random.rand(4, 4), columns=col, index=idx) + df.style.applymap(lambda x: "color: red;", subset=slice_).render() def test_applymap_subset_multiindex_code(self): # https://github.com/pandas-dev/pandas/issues/25858 @@ -1170,7 +1171,7 @@ def test_unique_id(self): assert np.unique(ids).size == len(ids) def test_table_styles(self): - style = [{"selector": "th", "props": [("foo", "bar")]}] + style = [{"selector": "th", "props": [("foo", "bar")]}] # default format styler = Styler(self.df, table_styles=style) result = " ".join(styler.render().split()) assert "th { foo: bar; }" in result @@ -1180,6 +1181,24 @@ def test_table_styles(self): assert styler is result assert styler.table_styles == style + # GH 39563 + style = [{"selector": "th", "props": "foo:bar;"}] # css string format + styler = self.df.style.set_table_styles(style) + result = " ".join(styler.render().split()) + assert "th { foo: bar; }" in result + + def test_maybe_convert_css_to_tuples(self): + expected = [("a", "b"), ("c", "d e")] + assert _maybe_convert_css_to_tuples("a:b;c:d e;") == expected + assert _maybe_convert_css_to_tuples("a: b ;c: d e ") == expected + expected = [] + assert _maybe_convert_css_to_tuples("") == expected + + def test_maybe_convert_css_to_tuples_err(self): + msg = "Styles supplied as string must follow CSS rule formats" + with pytest.raises(ValueError, match=msg): + _maybe_convert_css_to_tuples("err") + def test_table_attributes(self): attributes = 'class="foo" data-bar' styler = Styler(self.df, table_attributes=attributes) @@ -1900,6 +1919,18 @@ def test_tooltip_class(self): in s ) + # GH 39563 + s = ( + Styler(df, uuid_len=0) + .set_tooltips(DataFrame([["tooltip"]])) + .set_tooltips_class(name="other-class", properties="color:green;color:red;") + .render() + ) + assert ( + "#T__ .other-class {\n color: green;\n color: red;\n " + in s + ) + @td.skip_if_no_mpl class TestStylerMatplotlibDep: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 25d98928f1a6b..a510286d5412e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,6 +34,43 @@ date_strategy = st.datetimes() +def test_read_csv_with_custom_date_parser(all_parsers): + # GH36111 + def __custom_date_parser(time): + time = 
time.astype(np.float_) + time = time.astype(np.int_) # convert float seconds to int type + return pd.to_timedelta(time, unit="s") + + testdata = StringIO( + """time e n h + 41047.00 -98573.7297 871458.0640 389.0089 + 41048.00 -98573.7299 871458.0640 389.0089 + 41049.00 -98573.7300 871458.0642 389.0088 + 41050.00 -98573.7299 871458.0643 389.0088 + 41051.00 -98573.7302 871458.0640 389.0086 + """ + ) + result = all_parsers.read_csv( + testdata, + delim_whitespace=True, + parse_dates=True, + date_parser=__custom_date_parser, + index_col="time", + ) + time = [41047, 41048, 41049, 41050, 41051] + time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") + expected = DataFrame( + { + "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302], + "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], + "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], + }, + index=time, + ) + + tm.assert_frame_equal(result, expected) + + def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 5897b91a5fa70..058dc7659fc95 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2003,3 +2003,48 @@ def test_precision_loss(): tm.assert_series_equal(reread.dtypes, expected_dt) assert reread.loc[0, "little"] == df.loc[0, "little"] assert reread.loc[0, "big"] == float(df.loc[0, "big"]) + + +def test_compression_roundtrip(compression): + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.index.name = "index" + + with tm.ensure_clean() as path: + + df.to_stata(path, compression=compression) + reread = read_stata(path, compression=compression, index_col="index") + tm.assert_frame_equal(df, reread) + + # explicitly ensure file was compressed. 
+        with tm.decompress_file(path, compression) as fh:
+            contents = io.BytesIO(fh.read())
+            reread = pd.read_stata(contents, index_col="index")
+            tm.assert_frame_equal(df, reread)
+
+
+@pytest.mark.parametrize("to_infer", [True, False])
+@pytest.mark.parametrize("read_infer", [True, False])
+def test_stata_compression(compression_only, read_infer, to_infer):
+    compression = compression_only
+
+    ext = "gz" if compression == "gzip" else compression
+    filename = f"test.{ext}"
+
+    df = DataFrame(
+        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+        index=["A", "B"],
+        columns=["X", "Y", "Z"],
+    )
+    df.index.name = "index"
+
+    to_compression = "infer" if to_infer else compression
+    read_compression = "infer" if read_infer else compression
+
+    with tm.ensure_clean(filename) as path:
+        df.to_stata(path, compression=to_compression)
+        result = pd.read_stata(path, compression=read_compression, index_col="index")
+        tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 8393312004241..a28e2f22560eb 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -155,6 +155,28 @@ def test_tracemalloc_for_empty(self, table_type, dtype):
         del table
         assert get_allocated_khash_memory() == 0
 
+    def test_get_state(self, table_type, dtype):
+        table = table_type(1000)
+        state = table.get_state()
+        assert state["size"] == 0
+        assert state["n_occupied"] == 0
+        assert "n_buckets" in state
+        assert "upper_bound" in state
+
+    def test_no_reallocation(self, table_type, dtype):
+        for N in range(1, 110):
+            keys = np.arange(N).astype(dtype)
+            preallocated_table = table_type(N)
+            n_buckets_start = preallocated_table.get_state()["n_buckets"]
+            preallocated_table.map_locations(keys)
+            n_buckets_end = preallocated_table.get_state()["n_buckets"]
+            # original number of buckets was enough:
+            assert n_buckets_start == n_buckets_end
+            # check with clean table (not too much preallocated)
+            clean_table = table_type()
+            clean_table.map_locations(keys)
+            assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
 
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()
@@ -190,6 +212,21 @@ def test_tracemalloc_for_empty_StringHashTable():
     assert get_allocated_khash_memory() == 0
 
+
+def test_no_reallocation_StringHashTable():
+    for N in range(1, 110):
+        keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+        preallocated_table = ht.StringHashTable(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # original number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with clean table (not too much preallocated)
+        clean_table = ht.StringHashTable()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
 
 @pytest.mark.parametrize(
     "table_type, dtype",
     [
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 9cd13b2312ea7..b4848f80e9a2c 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -752,6 +752,25 @@ def test_plot_no_numeric_data(self):
         with pytest.raises(TypeError, match="no numeric data to plot"):
             df.plot()
 
+    @pytest.mark.parametrize(
+        "data, index",
+        [
+            ([1, 2, 3, 4], [3, 2, 1, 0]),
+            ([10, 50, 20, 30], [1910, 1920, 1980, 1950]),
+        ],
+    )
+    def test_plot_order(self, data, index):
+        # GH38865 Verify plot order
of a Series + ser = Series(data=data, index=index) + ax = ser.plot(kind="bar") + + expected = ser.tolist() + result = [ + patch.get_bbox().ymax + for patch in sorted(ax.patches, key=lambda patch: patch.get_bbox().xmax) + ] + assert expected == result + def test_style_single_ok(self): s = Series([1, 2]) ax = s.plot(style="s", color="C3") diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 86955ac4e4d22..1a10255a81a8c 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -31,15 +31,9 @@ async def test_tab_complete_ipython6_warning(ip): ) await ip.run_code(code) - # TODO: remove it when Ipython updates - # GH 33567, jedi version raises Deprecation warning in Ipython - import jedi - - if jedi.__version__ < "0.17.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("rs.", 1)) diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index ffeda703cd890..dd6dbd79113e5 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -365,13 +365,25 @@ def test_append_empty_tz_frame_with_datetime64ns(self): # pd.NaT gets inferred as tz-naive, so append result is tz-naive result = df.append({"a": pd.NaT}, ignore_index=True) - expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + expected = DataFrame({"a": [pd.NaT]}).astype(object) tm.assert_frame_equal(result, expected) # also test with typed value to append df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - result = df.append( - Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True - ) - expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + other = Series({"a": pd.NaT}, dtype="datetime64[ns]") + result = df.append(other, ignore_index=True) + expected = DataFrame({"a": [pd.NaT]}).astype(object) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] + ) + def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str): + # https://github.com/pandas-dev/pandas/issues/35460 + df = DataFrame(columns=["a"]).astype(dtype_str) + + other = DataFrame({"a": [np.timedelta64("NaT", "ns")]}) + result = df.append(other, ignore_index=True) + + expected = other.astype(object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 6dae28003d3b6..357274b66332f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -42,6 +42,7 @@ def test_categorical_concat(self, sort): "h": [None] * 6 + cat_values, } ) + exp["h"] = exp["h"].astype(df2["h"].dtype) tm.assert_frame_equal(res, exp) def test_categorical_concat_dtypes(self): diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 89b45b7266daa..906ed038c4840 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -357,6 +357,68 @@ def test_round_invalid(self): with pytest.raises(ValueError, 
match=msg): t1.round(freq) + def test_round_implementation_bounds(self): + # See also: analogous test for Timestamp + # GH#38964 + result = Timedelta.min.ceil("s") + expected = Timedelta.min + Timedelta(seconds=1) - Timedelta(145224193) + assert result == expected + + result = Timedelta.max.floor("s") + expected = Timedelta.max - Timedelta(854775807) + assert result == expected + + with pytest.raises(OverflowError, match="value too large"): + Timedelta.min.floor("s") + + # the second message here shows up in windows builds + msg = "|".join( + ["Python int too large to convert to C long", "int too big to convert"] + ) + with pytest.raises(OverflowError, match=msg): + Timedelta.max.ceil("s") + + @pytest.mark.parametrize("n", range(100)) + @pytest.mark.parametrize( + "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] + ) + def test_round_sanity(self, method, n, request): + iinfo = np.iinfo(np.int64) + val = np.random.randint(iinfo.min + 1, iinfo.max, dtype=np.int64) + td = Timedelta(val) + + assert method(td, "ns") == td + + res = method(td, "us") + nanos = 1000 + assert np.abs((res - td).value) < nanos + assert res.value % nanos == 0 + + res = method(td, "ms") + nanos = 1_000_000 + assert np.abs((res - td).value) < nanos + assert res.value % nanos == 0 + + res = method(td, "s") + nanos = 1_000_000_000 + assert np.abs((res - td).value) < nanos + assert res.value % nanos == 0 + + res = method(td, "min") + nanos = 60 * 1_000_000_000 + assert np.abs((res - td).value) < nanos + assert res.value % nanos == 0 + + res = method(td, "h") + nanos = 60 * 60 * 1_000_000_000 + assert np.abs((res - td).value) < nanos + assert res.value % nanos == 0 + + res = method(td, "D") + nanos = 24 * 60 * 60 * 1_000_000_000 + assert np.abs((res - td).value) < nanos + assert res.value % nanos == 0 + def test_contains(self): # Checking for any NaT-like objects # GH 13603 diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index dbc751dd614a1..c9e505ef4bbaf 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -607,29 +607,6 @@ def test_td64_series_assign_nat(nat_val, should_cast): tm.assert_series_equal(ser, expected) -@pytest.mark.parametrize( - "td", - [ - Timedelta("9 days"), - Timedelta("9 days").to_timedelta64(), - Timedelta("9 days").to_pytimedelta(), - ], -) -def test_append_timedelta_does_not_cast(td): - # GH#22717 inserting a Timedelta should _not_ cast to int64 - expected = Series(["x", td], index=[0, "td"], dtype=object) - - ser = Series(["x"]) - ser["td"] = td - tm.assert_series_equal(ser, expected) - assert isinstance(ser["td"], Timedelta) - - ser = Series(["x"]) - ser.loc["td"] = Timedelta("9 days") - tm.assert_series_equal(ser, expected) - assert isinstance(ser["td"], Timedelta) - - def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) @@ -759,15 +736,11 @@ def test_getitem_unrecognized_scalar(): timedelta_range("0", periods=20, freq="H"), ], ) -def test_slice_with_zero_step_raises(index): - ts = Series(np.arange(20), index) +def test_slice_with_zero_step_raises(index, frame_or_series, indexer_sli): + ts = frame_or_series(np.arange(20), index=index) with pytest.raises(ValueError, match="slice step cannot be zero"): - ts[::0] - with pytest.raises(ValueError, match="slice step cannot be zero"): - ts.loc[::0] - with pytest.raises(ValueError, match="slice step cannot be zero"): - ts.iloc[::0] + indexer_sli(ts)[::0] 
@pytest.mark.parametrize( @@ -784,7 +757,6 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(ts[l_slc], expected) tm.assert_series_equal(ts.loc[l_slc], expected) - tm.assert_series_equal(ts.loc[l_slc], expected) keystr1 = str(index[9]) keystr2 = str(index[13]) diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index 4caf6d03d8d80..10b9360802c1c 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -4,29 +4,24 @@ import pandas._testing as tm -def test_slice_float64(): +def test_slice_float64(frame_or_series): values = np.arange(10.0, 50.0, 2) index = Index(values) start, end = values[[5, 15]] - s = Series(np.random.randn(20), index=index) + data = np.random.randn(20, 3) + if frame_or_series is not DataFrame: + data = data[:, 0] - result = s[start:end] - expected = s.iloc[5:16] - tm.assert_series_equal(result, expected) - - result = s.loc[start:end] - tm.assert_series_equal(result, expected) - - df = DataFrame(np.random.randn(20, 3), index=index) + obj = frame_or_series(data, index=index) - result = df[start:end] - expected = df.iloc[5:16] - tm.assert_frame_equal(result, expected) + result = obj[start:end] + expected = obj.iloc[5:16] + tm.assert_equal(result, expected) - result = df.loc[start:end] - tm.assert_frame_equal(result, expected) + result = obj.loc[start:end] + tm.assert_equal(result, expected) def test_getitem_setitem_slice_bug(): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 9ace404930876..767b61e31698b 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -9,6 +9,7 @@ MultiIndex, NaT, Series, + Timedelta, Timestamp, date_range, period_range, @@ -281,6 +282,22 @@ def test_setitem_dt64_into_int_series(self, dtype): ser[:-1] = np.array([val, val]) tm.assert_series_equal(ser, expected) + @pytest.mark.parametrize("unique", [True, False]) + @pytest.mark.parametrize("val", [3, 3.0, "3"], ids=type) + def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): + # dont cast these 3-like values to bool + ser = Series([True, False]) + if not unique: + ser.index = [1, 1] + + indexer_sli(ser)[1] = val + assert type(ser.iloc[1]) == type(val) + + expected = Series([True, val], dtype=object, index=ser.index) + if not unique and indexer_sli is not tm.iloc: + expected = Series([val, val], dtype=object, index=[1, 1]) + tm.assert_series_equal(ser, expected) + class SetitemCastingEquivalents: """ @@ -291,52 +308,71 @@ class SetitemCastingEquivalents: - the setitem does not expand the obj """ - @pytest.fixture(params=[np.nan, np.float64("NaN")]) - def val(self, request): + @pytest.fixture + def is_inplace(self): """ - One python float NaN, one np.float64. Only np.float64 has a `dtype` - attribute. + Indicate that we are not (yet) checking whether or not setting is inplace. 
""" - return request.param + return None - def check_indexer(self, obj, key, expected, val, indexer): + def check_indexer(self, obj, key, expected, val, indexer, is_inplace): + orig = obj obj = obj.copy() + arr = obj._values + indexer(obj)[key] = val tm.assert_series_equal(obj, expected) - def test_int_key(self, obj, key, expected, val, indexer_sli): + self._check_inplace(is_inplace, orig, arr, obj) + + def _check_inplace(self, is_inplace, orig, arr, obj): + if is_inplace is None: + # We are not (yet) checking whether setting is inplace or not + pass + elif is_inplace: + assert obj._values is arr + else: + # otherwise original array should be unchanged + tm.assert_equal(arr, orig._values) + + def test_int_key(self, obj, key, expected, val, indexer_sli, is_inplace): if not isinstance(key, int): return - self.check_indexer(obj, key, expected, val, indexer_sli) + self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) + + if indexer_sli is tm.loc: + self.check_indexer(obj, key, expected, val, tm.at, is_inplace) + elif indexer_sli is tm.iloc: + self.check_indexer(obj, key, expected, val, tm.iat, is_inplace) rng = range(key, key + 1) - self.check_indexer(obj, rng, expected, val, indexer_sli) + self.check_indexer(obj, rng, expected, val, indexer_sli, is_inplace) if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently slc = slice(key, key + 1) - self.check_indexer(obj, slc, expected, val, indexer_sli) + self.check_indexer(obj, slc, expected, val, indexer_sli, is_inplace) ilkey = [key] - self.check_indexer(obj, ilkey, expected, val, indexer_sli) + self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - self.check_indexer(obj, indkey, expected, val, indexer_sli) + self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) - def test_slice_key(self, obj, key, expected, val, indexer_sli): + def test_slice_key(self, obj, key, expected, val, indexer_sli, is_inplace): if not isinstance(key, slice): return if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently - self.check_indexer(obj, key, expected, val, indexer_sli) + self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) ilkey = list(range(len(obj)))[key] - self.check_indexer(obj, ilkey, expected, val, indexer_sli) + self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - self.check_indexer(obj, indkey, expected, val, indexer_sli) + self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) def test_mask_key(self, obj, key, expected, val, indexer_sli): # setitem with boolean mask @@ -347,14 +383,19 @@ def test_mask_key(self, obj, key, expected, val, indexer_sli): indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) - def test_series_where(self, obj, key, expected, val): + def test_series_where(self, obj, key, expected, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True + orig = obj obj = obj.copy() + arr = obj._values + res = obj.where(~mask, val) tm.assert_series_equal(res, expected) + self._check_inplace(is_inplace, orig, arr, obj) + def test_index_where(self, obj, key, expected, val, request): if Index(obj).dtype != obj.dtype: pytest.skip("test not applicable for this dtype") @@ -362,8 +403,7 @@ def test_index_where(self, obj, key, expected, val, request): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == bool and not mask.all(): - # When mask is all True, casting behavior does not 
apply + if obj.dtype == bool: msg = "Index/Series casting behavior inconsistent GH#38692" mark = pytest.mark.xfail(reason=msg) request.node.add_marker(mark) @@ -371,7 +411,6 @@ def test_index_where(self, obj, key, expected, val, request): res = Index(obj).where(~mask, val) tm.assert_index_equal(res, Index(expected)) - @pytest.mark.xfail(reason="Index/Series casting behavior inconsistent GH#38692") def test_index_putmask(self, obj, key, expected, val): if Index(obj).dtype != obj.dtype: pytest.skip("test not applicable for this dtype") @@ -470,6 +509,28 @@ def test_setitem_empty_series_timestamp_preserves_dtype(self): result = series["timestamp"] assert result == expected + @pytest.mark.parametrize( + "td", + [ + Timedelta("9 days"), + Timedelta("9 days").to_timedelta64(), + Timedelta("9 days").to_pytimedelta(), + ], + ) + def test_append_timedelta_does_not_cast(self, td): + # GH#22717 inserting a Timedelta should _not_ cast to int64 + expected = Series(["x", td], index=[0, "td"], dtype=object) + + ser = Series(["x"]) + ser["td"] = td + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], Timedelta) + + ser = Series(["x"]) + ser.loc["td"] = Timedelta("9 days") + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], Timedelta) + def test_setitem_scalar_into_readonly_backing_data(): # GH#14359: test that you cannot mutate a read only buffer @@ -500,25 +561,41 @@ def test_setitem_slice_into_readonly_backing_data(): assert not array.any() -@pytest.mark.parametrize( - "key", [0, slice(0, 1), [0], np.array([0]), range(1)], ids=type -) -@pytest.mark.parametrize("dtype", [complex, int, float]) -def test_setitem_td64_into_complex(key, dtype, indexer_sli): - # timedelta64 should not be treated as integers - arr = np.arange(5).astype(dtype) - ser = Series(arr) - td = np.timedelta64(4, "ns") - - indexer_sli(ser)[key] = td - assert ser.dtype == object - assert arr[0] == 0 # original array is unchanged - - if not isinstance(key, int) and not ( - indexer_sli is tm.loc and isinstance(key, slice) - ): - # skip key/indexer_sli combinations that will have mismatched lengths +class TestSetitemTimedelta64IntoNumeric(SetitemCastingEquivalents): + # timedelta64 should not be treated as integers when setting into + # numeric Series + + @pytest.fixture + def val(self): + td = np.timedelta64(4, "ns") + return td + # TODO: could also try np.full((1,), td) + + @pytest.fixture(params=[complex, int, float]) + def dtype(self, request): + return request.param + + @pytest.fixture + def obj(self, dtype): + arr = np.arange(5).astype(dtype) + ser = Series(arr) + return ser + + @pytest.fixture + def expected(self, dtype): + arr = np.arange(5).astype(dtype) ser = Series(arr) - indexer_sli(ser)[key] = np.full((1,), td) - assert ser.dtype == object - assert arr[0] == 0 # original array is unchanged + ser = ser.astype(object) + ser.values[0] = np.timedelta64(4, "ns") + return ser + + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def is_inplace(self): + """ + Indicate we do _not_ expect the setting to be done inplace. 
+ """ + return False diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 1a141e3201d57..d683503f22f28 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -341,6 +341,11 @@ def test_astype_unicode(self): reload(sys) sys.setdefaultencoding(former_encoding) + def test_astype_bytes(self): + # GH#39474 + result = Series(["foo", "bar", "baz"]).astype(bytes) + assert result.dtypes == np.dtype("S3") + class TestAstypeCategorical: def test_astype_categorical_invalid_conversions(self): diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b619c9c9632e3..3f3a3af658969 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -95,6 +95,8 @@ def test_replace_gh5319(self): expected = ser.ffill() result = ser.replace(np.nan) tm.assert_series_equal(result, expected) + + def test_replace_datetime64(self): # GH 5797 ser = pd.Series(pd.date_range("20130101", periods=5)) expected = ser.copy() @@ -104,6 +106,7 @@ def test_replace_gh5319(self): result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101")) tm.assert_series_equal(result, expected) + def test_replace_nat_with_tz(self): # GH 11792: Test with replacing NaT in a list with tz data ts = pd.Timestamp("2015/01/01", tz="UTC") s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")]) diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 8d9b54cf3f0df..edb0f8c7dd662 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -10,6 +10,7 @@ import warnings from hypothesis import assume, given, strategies as st +from hypothesis.errors import Flaky from hypothesis.extra.dateutil import timezones as dateutil_timezones from hypothesis.extra.pytz import timezones as pytz_timezones import pytest @@ -103,6 +104,7 @@ def test_on_offset_implementations(dt, offset): assert offset.is_on_offset(dt) == (compare == dt) +@pytest.mark.xfail(strict=False, raises=Flaky, reason="unreliable test timings") @given(gen_yqm_offset) def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index c1621669bffd0..5f7f1b898877c 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -11,12 +11,11 @@ from pandas import Timedelta, Timestamp import pandas._testing as tm +from pandas.tests.tseries.offsets.common import assert_offset_equal from pandas.tseries import offsets from pandas.tseries.offsets import Hour, Micro, Milli, Minute, Nano, Second -from .common import assert_offset_equal - # --------------------------------------------------------------------- # Test Helpers diff --git a/pandas/tests/util/test_assert_attr_equal.py b/pandas/tests/util/test_assert_attr_equal.py new file mode 100644 index 0000000000000..6fad38c2cd44e --- /dev/null +++ b/pandas/tests/util/test_assert_attr_equal.py @@ -0,0 +1,30 @@ +from types import SimpleNamespace + +import pytest + +from pandas.core.dtypes.common import is_float + +import pandas._testing as tm + + +def test_assert_attr_equal(nulls_fixture): + obj = SimpleNamespace() + obj.na_value = nulls_fixture + assert tm.assert_attr_equal("na_value", obj, obj) + + +def 
test_assert_attr_equal_different_nulls(nulls_fixture, nulls_fixture2): + obj = SimpleNamespace() + obj.na_value = nulls_fixture + + obj2 = SimpleNamespace() + obj2.na_value = nulls_fixture2 + + if nulls_fixture is nulls_fixture2: + assert tm.assert_attr_equal("na_value", obj, obj2) + elif is_float(nulls_fixture) and is_float(nulls_fixture2): + # we consider float("nan") and np.float64("nan") to be equivalent + assert tm.assert_attr_equal("na_value", obj, obj2) + else: + with pytest.raises(AssertionError, match='"na_value" are different'): + tm.assert_attr_equal("na_value", obj, obj2) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 7d3c29dc60be0..300f3f5729614 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -325,3 +325,17 @@ def test_is_datetimelike_deprecated(): s = Series(range(1)).rolling(1) with tm.assert_produces_warning(FutureWarning): assert not s.is_datetimelike + + +@pytest.mark.filterwarnings("ignore:min_periods:FutureWarning") +def test_dont_modify_attributes_after_methods( + arithmetic_win_operators, closed, center, min_periods +): + # GH 39554 + roll_obj = Series(range(1)).rolling( + 1, center=center, closed=closed, min_periods=min_periods + ) + expected = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes} + getattr(roll_obj, arithmetic_win_operators)() + result = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes} + assert result == expected diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 94bc755f300a2..fd4dfa7b7ed2b 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -159,7 +159,10 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) # Check that the function output matches applying an alternative function # if min_periods isn't specified - rolling3 = constructor(values).rolling(window=indexer) + # GH 39604: After count-min_periods deprecation, apply(lambda x: len(x)) + # is equivalent to count after setting min_periods=0 + min_periods = 0 if func == "count" else None + rolling3 = constructor(values).rolling(window=indexer, min_periods=min_periods) result3 = getattr(rolling3, func)() expected3 = constructor(rolling3.apply(lambda x: np_func(x, **np_kwargs))) tm.assert_equal(result3, expected3) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index b89fb35ac3a70..d3c2b5467e5bb 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -117,6 +117,9 @@ def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) + # GH 39591: The grouped column should be all np.nan + # (groupby.apply inserts 0s for cov) + expected["A"] = np.nan tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) @@ -688,6 +691,13 @@ def func(x): return getattr(x.expanding(), f)(self.frame) expected = g.apply(func) + # GH 39591: groupby.apply returns 1 instead of nan for windows + # with all nan values + null_idx = list(range(20, 61)) + list(range(72, 113)) + expected.iloc[null_idx, 1] = np.nan + # GH 39591: The grouped column should be all np.nan + # (groupby.apply inserts 0s for cov) + expected["A"] = np.nan tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) diff --git a/versioneer.py b/versioneer.py index e7fed874ae20f..68c9bb161f206 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1776,7 +1776,7 @@ def 
make_release_tree(self, base_dir, files): """ INIT_PY_SNIPPET = """ -from ._version import get_versions +from pandas._version import get_versions __version__ = get_versions()['version'] del get_versions """ From a3b636980aa38cb2ecc9dda504741b63302b5f08 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 7 Feb 2021 12:18:12 -0500 Subject: [PATCH 17/18] Added xlsx test file --- .../io/data/excel/empty_with_blank_row.xlsx | Bin 0 -> 4301 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/data/excel/empty_with_blank_row.xlsx diff --git a/pandas/tests/io/data/excel/empty_with_blank_row.xlsx b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..fe3bcfcc269d77e56e9bd99a4225e66a38bb6f6e GIT binary patch literal 4301 zcmZ`-by!qe`yCiM1ZC)uMnpnD8cFGy!I4Hf9lAuiJEc>}OCyMsbV+weH`3im{D$lA zd-U>s_nhaP{hUA6KKotgeb;{1QIrRwfB*mt48VEG3z%`fcjY_)01!k50I(6Gj;)co z9Xs3Yy(p?*4)_#5VAm};(#5fYP|~ohE^r`O#?3RK=UR_9k%QF1&BX^pBQG00Le~AT zmscWb=7I5XaEf%WlvTb4OKA8diA(0kd#9V^b!Mu)bbTsv^mqnc`6I37N7PZ4NT10$ z#$Uf|R?`+()Xk=-4ltZCbn&&&(gEHWuAKztoy?lf{tQzIIQ5M$$=j3)a9q|#mf{~= zRLzKFJ$)p_&Q*IK+AsM^IgJcbW6th)?gGJhG^SZPA83NT&zc{@L=5R?K^9$s5pcAiziBM+^qgTjwv`XL124)a_wO)W%9rB%W zPMq)@ZFz?aWTIx-AmfyCD(8?Rwc9qRweUoI2A&KB5W)rJpG=!R7;v`YO^JrvM1sge z1Qg&)XzM=qxRtO}Lm$^fdX}Af(nx4RVJ=ElhMZSyW}D9VU&Vrcf|-3lxQIbhXM`|} zTJC!q@!p8dJm6!c&h_v$d^$)&@rPovW-VB?;pA~Z)$mBb41P{$Vw4n-Sum($KQ7~D_P$Xz^P~hG z4?p%?ZK9LLY>L@BWs_REa!<2fqmW`Xo9|@X)|7!j!qa|QRn&M5u^u+viQbKwgRI`? z!_K>dE=C3_G(6aIl}@P7DkWw-Gi36>z3&yu4lN8ki7SvBH7;J_D_kXEw+Z%)8YXiF4FCcJf4C8N~5{E44$PBBdcZ?B0q!M1OA%6Wn6q)u4;zN?2RtSel- zGaD8DWhjSz!m;DMz#@$E(>%t45XmXChp1_m!efcPv3>e^LyvaNVVc=DblP9U*($3C zVc&3i7c{~GGdgYvCn|iLmlL;BS54Bf+=5Aq)((ES+kQ8Bsrv#uCER#v))8G_RjA}vsS+?G?(eF#;o5ObT%L#uCe)>9HfA0UQb zQDXfqf@sD`IcmiIMji|^L=(8;V}81;E7{{-`%!AxkS7dnP=ErrdG9Uf83!2@ZQUYi zXg&(e3H697q}cFwmzAE&NVi%>>e}{GaoM4-cpKx|wQ)_!)u7tCLHy4g35EKDd=Tnj za327`{goqj_D<$TcDDtwtp?{r z%k=E%CqHD^x0%r)fH03~ioN0V@?b0XZFETN{MvhXOH9hb+N0LME`=3^I4;+k1P`H> z4mDc4=GZK*X+i?dsjeBhr=2Z`;E)`iCbpfyKewZVXdM z<{qnco6gymq8mxPb}N#F+C%rwrsG1>t2siQfGi)QrY#>R;5!OeANkH z?{`L)IG}?fX<;z2tY30UhL+uiYdsDe120;`%IDO;*p}naKQXnT$S3hXr&dpE`_ChJ zQfZeJ##t8LXg6EC>yN!CElV{bJ$QZlgT?J<%B9<`YpV~78hp6i243)2Q6HB=_5m+; z^7&ilW>S;!qH1(FEfbG2b(5{LeW^Du)(j>2+@n3+{TrNz>?M=xNs<`znqx3tye<-{n!GHQ-|S04^>V9 z&r;-3S_soHA95@mF}xwLVXj!GA&hP_VNoH&`-LkW&vljSd&0N9GuEvvT`u69Hjm}!X2!Va+gT_Wn983CcV0LNV9D;+FG)<|fXuz8c)_oiLT{Un@JL3LwU z7pFuUF@__H2pmwznb7gh{FQX$VE6|*%|#pVGQ86~fH3r9*nGS=l6q>M<`OQI>Z1ll zLKziVnAhgWQ_aGTF;9HmUO+&{Z!@B(J}!QXYEv<8Y?DbZ z;N2>i%5g&w^m(s*vHkAjlt4U!$NQBMkVy4zgP%@%? 
zrF{f397F<-O8pqKqrpK^LTR=N#inJRq~7PpYm(O^@41&w!V>G!8aZX01o&NCc&v_v zDP)oK>iSGft^&GBC8W=9l5=nlxd5O*OMA%0%TLdO6nrLv(b)~624n{OIL4eNv`0t# zbf8PdtoCwmBI>voJ6bFi0(`BO%0tvLZTvB#jv${ecV6#5%(lMz zE6B7o98HkP>r=^r;MB#0l3@XB;lnbQ>0oSDqWvIf)@GA#=S%FlVGd?+`B=Ktec~5l zZZNiaycmH)|3heim~S@@zt!020Ykx2fbyC9d4bgFLR>{wSg|SQtZv�r~ha5qB6@ zZG2yIzf&9z2URJ};SIhUIAhP1QdhI^3F)eA`?b!y^F_Sh1$^6(?ktsTDI!{uK*W}V z*l_@cRtAc;R@Qdx23EF4x0P!F)3oyD0J+b7qIL@)>iHoKh_={&!ze#uKB4)Ys+2tD zFX7w7pcP^jLRgnP+dwO&jgrkT4ewl+^}^{rp-~IS|%@vVunv-xcvOy zbY+W-1dN1Xe#BEGCAp2L&t+m|XgXQuFG;^y%Y9(B3hR2gK11hroL)f*E!6Y@A z=m__vWIxEP~eWq}N`CMNRnBE7{?pPAZohwD4<(~eJS2@kM{q?9*jdKHY7@qNXE zgng^9$b37bd3E{JO;d>dXMw+*Bb31tH1Q@n$Pl6%RQ2OgPh{&(TX_W<28+d*k;9DJ5NLa|_hhv_i#IY>wAW*aHjB6Pi-_nnrDu zY?nj4Sj#PgCa)KSjPyQzjewQRK5Z`cn{spYZcM+ya1Fy^Az8CK-#rBVp`W+(Hz|JY z+Cb3mjQ9ljf6}jKZGFprY9!RGg#)K;_a~_7D2Wf7CT6j!FeVjiiiW%fwO>U8@1{jX z?qYk@V&J4U`NM}ZS!eSawO1)LjN4jJs)8iYe3t4YrON@2P`nw89t6Q$qC8qg4l+Ln zx`aI!mBh{KK#6y@6~`f$ix%=C%B4r1iwCv6M)Q zMGvH>Q}!DaS62Gn*4?UndzQ=r~>g38ip|dR{b=lteFz zZqZ~iUf?D>iNA$wP!AWgfrg=m#ZL0I0o}j)t84c<0k>@VYV~6Wk?cDpQpCQsQI5C{ z`fa)UO_8&8$-b>w56O+PHWO6e_cS6ZC{$fdJCyd7XFZu-(KG?Y=CxN2U4*ot6eJ6K zDm_)7-c_wtUyIK|hJV;|6lA7Owk4*jZbmzK5Y6HIL&z;lxzewWoY&)-F{JeTW24Ud zoIwSxZYiRNtRC&CSGlWM$LIwf;-J>Gt|Ks?> zvEBvV^>n{cn21mL?+)+oJa=us+L=tl^X@jt3dQ63c$GvdDm;v_`u4H@@tpZ*V(X?ZLF literal 0 HcmV?d00001 From 3e9bd4f34999bf079b7500bedf17ebf5d311aeb1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 8 Feb 2021 07:14:52 -0500 Subject: [PATCH 18/18] xfail and improve tests --- pandas/tests/io/excel/test_openpyxl.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index e94f06900a4c0..0962b719efd4d 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -191,10 +191,21 @@ def test_append_mode_file(ext): assert second != -1 and third == -1 -def test_read_with_empty_trailing_rows(datapath, ext): +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_empty_trailing_rows(datapath, ext, read_only, request): # GH 39181 + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker(pytest.mark.xfail(reason=msg)) path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") - result = pd.read_excel(path) + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() expected = DataFrame( { "Title": [np.nan, "A", 1, 2, 3], @@ -205,6 +216,7 @@ def test_read_with_empty_trailing_rows(datapath, ext): tm.assert_frame_equal(result, expected) +# When read_only is None, use read_excel instead of a workbook @pytest.mark.parametrize("read_only", [True, False, None]) def test_read_empty_with_blank_row(datapath, ext, read_only): # GH 39547 - empty excel file with a row that has no data