From 367c06c45ed8d8dcc5e21a6f81cc7b39b2e0ae7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jure=20=C5=A0orn?= <sornjure@gmail.com> Date: Thu, 7 Nov 2024 13:42:53 +0100 Subject: [PATCH] OS Commands, big changes to Pandas --- README.md | 189 +++++++++++++++++++++++++-------------------------- index.html | 195 ++++++++++++++++++++++++++--------------------------- parse.js | 41 +++++------ 3 files changed, 208 insertions(+), 217 deletions(-) diff --git a/README.md b/README.md index e905c08..070fda5 100644 --- a/README.md +++ b/README.md @@ -1740,9 +1740,9 @@ shutil.rmtree(<path>) # Deletes the directory. ### Shell Commands ```python -<pipe> = os.popen('<command>') # Executes command in sh/cmd. Returns its stdout pipe. +<pipe> = os.popen('<commands>') # Executes commands in sh/cmd. Returns combined stdout. <str> = <pipe>.read(size=-1) # Reads 'size' chars or until EOF. Also readline/s(). -<int> = <pipe>.close() # Closes the pipe. Returns None on success (returncode 0). +<int> = <pipe>.close() # Returns None if last command exited with returncode 0. ``` #### Sends '1 + 1' to the basic calculator and captures its output: @@ -3146,6 +3146,8 @@ if __name__ == '__main__': Pandas ------ +**Data analysis library. For examples see [Plotly](#displaysalinechartoftotalcoronavirusdeathspermilliongroupedbycontinent).** + ```python # $ pip3 install pandas matplotlib import pandas as pd, matplotlib.pyplot as plt @@ -3155,65 +3157,61 @@ import pandas as pd, matplotlib.pyplot as plt **Ordered dictionary with a name.** ```python ->>> sr = pd.Series([1, 2], index=['x', 'y'], name='a'); sr +>>> s = pd.Series([1, 2], index=['x', 'y'], name='a'); s x 1 y 2 Name: a, dtype: int64 ``` ```python -<Sr> = pd.Series(<list>) # Assigns RangeIndex starting at 0. -<Sr> = pd.Series(<dict>) # Takes dictionary's keys for index. -<Sr> = pd.Series(<dict/Series>, index=<list>) # Only keeps items with keys specified in index. +<S> = pd.Series(<list>) # Assigns RangeIndex starting at 0. 
+<S> = pd.Series(<dict>) # Takes dictionary's keys for index. +<S> = pd.Series(<dict/Series>, index=<list>) # Only keeps items with keys specified in index. ``` ```python -<el> = <Sr>.loc[key] # Or: <Sr>.iloc[i] -<Sr> = <Sr>.loc[coll_of_keys] # Or: <Sr>.iloc[coll_of_i] -<Sr> = <Sr>.loc[from_key : to_key_inc] # Or: <Sr>.iloc[from_i : to_i_exc] +<el> = <S>.loc[key] # Or: <S>.iloc[i] +<S> = <S>.loc[coll_of_keys] # Or: <S>.iloc[coll_of_i] +<S> = <S>.loc[from_key : to_key_inc] # Or: <S>.iloc[from_i : to_i_exc] ``` ```python -<el> = <Sr>[key/i] # Or: <Sr>.<key> -<Sr> = <Sr>[coll_of_keys/coll_of_i] # Or: <Sr>[key/i : key/i] -<Sr> = <Sr>[bools] # Or: <Sr>.loc/iloc[bools] +<el> = <S>[key/i] # Or: <S>.<key> +<S> = <S>[coll_of_keys/coll_of_i] # Or: <S>[key/i : key/i] +<S> = <S>[bools] # Or: <S>.loc/iloc[bools] ``` ```python -<Sr> = <Sr> > <el/Sr> # Returns a Series of bools. -<Sr> = <Sr> + <el/Sr> # Items with non-matching keys get value NaN. +<S> = <S> > <el/S> # Returns a Series of bools. +<S> = <S> + <el/S> # Items with non-matching keys get value NaN. ``` ```python -<Sr> = pd.concat(<coll_of_Sr>) # Concats multiple series into one long Series. -<Sr> = <Sr>.combine_first(<Sr>) # Adds items that are not yet present. -<Sr>.update(<Sr>) # Updates items that are already present. +<S> = pd.concat(<coll_of_S>) # Concats multiple series into one long Series. +<S> = <S>.combine_first(<S>) # Adds items that are not yet present. +<S>.update(<S>) # Updates items that are already present. ``` ```python -<Sr>.plot.line/area/bar/pie/hist() # Generates a Matplotlib plot. +<S>.plot.line/area/bar/pie/hist() # Generates a Matplotlib plot. plt.show() # Displays the plot. Also plt.savefig(<path>). ``` +* **Indexing objects can't be tuples because `'obj[x, y]'` is converted to `'obj[(x, y)]'`!** +* **Pandas uses NumPy types like `'np.int64'`. Series is converted to `'float64'` if we assign np.nan to any item. 
Use `'<S>.astype(<str/type>)'` to get converted Series.** #### Series — Aggregate, Transform, Map: ```python -<el> = <Sr>.sum/max/mean/idxmax/all() # Or: <Sr>.agg(lambda <Sr>: <el>) -<Sr> = <Sr>.rank/diff/cumsum/ffill/interpo…() # Or: <Sr>.agg/transform(lambda <Sr>: <Sr>) -<Sr> = <Sr>.fillna(<el>) # Or: <Sr>.agg/transform/map(lambda <el>: <el>) -``` - -```python ->>> sr = pd.Series([2, 3], index=['x', 'y']); sr -x 2 -y 3 +<el> = <S>.sum/max/mean/idxmax/all() # Or: <S>.agg(lambda <S>: <el>) +<S> = <S>.rank/diff/cumsum/ffill/interpol…() # Or: <S>.agg/transform(lambda <S>: <S>) +<S> = <S>.isna/fillna/isin([<el/coll>]) # Or: <S>.agg/transform/map(lambda <el>: <el>) ``` ```text +---------------+-------------+-------------+---------------+ | | 'sum' | ['sum'] | {'s': 'sum'} | +---------------+-------------+-------------+---------------+ -| sr.apply(…) | 5 | sum 5 | s 5 | -| sr.agg(…) | | | | +| s.apply(…) | 3 | sum 3 | s 3 | +| s.agg(…) | | | | +---------------+-------------+-------------+---------------+ ``` @@ -3221,14 +3219,13 @@ y 3 +---------------+-------------+-------------+---------------+ | | 'rank' | ['rank'] | {'r': 'rank'} | +---------------+-------------+-------------+---------------+ -| sr.apply(…) | | rank | | -| sr.agg(…) | x 1 | x 1 | r x 1 | -| | y 2 | y 2 | y 2 | +| s.apply(…) | | rank | | +| s.agg(…) | x 1.0 | x 1.0 | r x 1.0 | +| | y 2.0 | y 2.0 | y 2.0 | +---------------+-------------+-------------+---------------+ ``` -* **Indexing objects can't be tuples because `'obj[x, y]'` is converted to `'obj[(x, y)]'`!** * **Methods ffill(), interpolate(), fillna() and dropna() accept `'inplace=True'`.** -* **Last result has a hierarchical index. Use `'<Sr>[key_1, key_2]'` to get its values.** +* **Last result has a multi-index. Use `'<S>[key_1, key_2]'` to get its values.** ### DataFrame **Table with labeled rows and columns.** @@ -3241,33 +3238,39 @@ b 3 4 ``` ```python -<DF> = pd.DataFrame(<list_of_rows>) # Rows can be either lists, dicts or series. 
-<DF> = pd.DataFrame(<dict_of_columns>) # Columns can be either lists, dicts or series. +<DF> = pd.DataFrame(<list_of_rows>) # Rows can be either lists, dicts or series. +<DF> = pd.DataFrame(<dict_of_columns>) # Columns can be either lists, dicts or series. ``` ```python -<el> = <DF>.loc[row_key, col_key] # Or: <DF>.iloc[row_i, col_i] -<Sr/DF> = <DF>.loc[row_key/s] # Or: <DF>.iloc[row_i/s] -<Sr/DF> = <DF>.loc[:, col_key/s] # Or: <DF>.iloc[:, col_i/s] -<DF> = <DF>.loc[row_bools, col_bools] # Or: <DF>.iloc[row_bools, col_bools] +<el> = <DF>.loc[row_key, col_key] # Or: <DF>.iloc[row_i, col_i] +<S/DF> = <DF>.loc[row_key/s] # Or: <DF>.iloc[row_i/s] +<S/DF> = <DF>.loc[:, col_key/s] # Or: <DF>.iloc[:, col_i/s] +<DF> = <DF>.loc[row_bools, col_bools] # Or: <DF>.iloc[row_bools, col_bools] ``` ```python -<Sr/DF> = <DF>[col_key/s] # Or: <DF>.<col_key> -<DF> = <DF>[row_bools] # Keeps rows as specified by bools. -<DF> = <DF>[<DF_of_bools>] # Assigns NaN to items that are False in bools. +<S/DF> = <DF>[col_key/s] # Or: <DF>.<col_key> +<DF> = <DF>[row_bools] # Keeps rows as specified by bools. +<DF> = <DF>[<DF_of_bools>] # Assigns NaN to items that are False in bools. ``` ```python -<DF> = <DF> > <el/Sr/DF> # Returns DF of bools. Sr is treated as a row. -<DF> = <DF> + <el/Sr/DF> # Items with non-matching keys get value NaN. +<DF> = <DF> > <el/S/DF> # Returns DF of bools. S is treated as a row. +<DF> = <DF> + <el/S/DF> # Items with non-matching keys get value NaN. ``` ```python -<DF> = <DF>.set_index(col_key) # Replaces row keys with column's values. -<DF> = <DF>.reset_index(drop=False) # Drops or moves row keys to column named index. -<DF> = <DF>.sort_index(ascending=True) # Sorts rows by row keys. Use `axis=1` for cols. -<DF> = <DF>.sort_values(col_key/s) # Sorts rows by passed column/s. Also `axis=1`. +<DF> = <DF>.set_index(col_key) # Replaces row keys with column's values. +<DF> = <DF>.reset_index(drop=False) # Drops or moves row keys to column named index. 
+<DF> = <DF>.sort_index(ascending=True) # Sorts rows by row keys. Use `axis=1` for cols. +<DF> = <DF>.sort_values(col_key/s) # Sorts rows by passed column/s. Also `axis=1`. +``` + +```python +<DF> = <DF>.head/tail/sample(<int>) # Returns first, last, or random n rows. +<DF> = <DF>.describe() # Describes columns. Also shape, info(), corr(). +<DF> = <DF>.query('<query>') # Filters rows with e.g. 'col_1 == val_1 and …'. ``` ```python @@ -3301,41 +3304,28 @@ c 6 7 | axis=0, | a 1 2 . | 2 | | Uses 'outer' by default. | | join=…) | b 3 4 . | 4 | | A Series is treated as a | | | b . 4 5 | 4 | | column. To add a row use | -| | c . 6 7 | 6 | | pd.concat([l, DF([sr])]).| +| | c . 6 7 | 6 | | pd.concat([l, DF([s])]). | +------------------------+---------------+------------+------------+--------------------------+ | pd.concat([l, r], | x y y z | | | Adds columns at the | | axis=1, | a 1 2 . . | x y y z | | right end. Uses 'outer' | | join=…) | b 3 4 4 5 | 3 4 4 5 | | by default. A Series is | | | c . . 6 7 | | | treated as a column. | +------------------------+---------------+------------+------------+--------------------------+ -| l.combine_first(r) | x y z | | | Adds missing rows and | -| | a 1 2 . | | | columns. Also updates | -| | b 3 4 5 | | | items that contain NaN. | -| | c . 6 7 | | | Argument r must be a DF. | -+------------------------+---------------+------------+------------+--------------------------+ ``` #### DataFrame — Aggregate, Transform, Map: ```python -<Sr> = <DF>.sum/max/mean/idxmax/all() # Or: <DF>.apply/agg(lambda <Sr>: <el>) -<DF> = <DF>.rank/diff/cumsum/ffill/interpo…() # Or: <DF>.apply/agg/transfo…(lambda <Sr>: <Sr>) -<DF> = <DF>.fillna(<el>) # Or: <DF>.applymap(lambda <el>: <el>) -``` -* **All operations operate on columns by default. 
Pass `'axis=1'` to process the rows instead.** - -```python ->>> df = pd.DataFrame([[1, 2], [3, 4]], index=['a', 'b'], columns=['x', 'y']); df - x y -a 1 2 -b 3 4 +<S> = <DF>.sum/max/mean/idxmax/all() # Or: <DF>.apply/agg(lambda <S>: <el>) +<DF> = <DF>.rank/diff/cumsum/ffill/interpo…() # Or: <DF>.apply/agg/transform(lambda <S>: <S>) +<DF> = <DF>.isna/fillna/isin([<el/coll>]) # Or: <DF>.applymap(lambda <el>: <el>) ``` ```text +-----------------+-------------+-------------+---------------+ | | 'sum' | ['sum'] | {'x': 'sum'} | +-----------------+-------------+-------------+---------------+ -| df.apply(…) | x 4 | x y | x 4 | -| df.agg(…) | y 6 | sum 4 6 | | +| l.apply(…) | x 4 | x y | x 4 | +| l.agg(…) | y 6 | sum 4 6 | | +-----------------+-------------+-------------+---------------+ ``` @@ -3343,16 +3333,25 @@ b 3 4 +-----------------+-------------+-------------+---------------+ | | 'rank' | ['rank'] | {'x': 'rank'} | +-----------------+-------------+-------------+---------------+ -| df.apply(…) | | x y | | -| df.agg(…) | x y | rank rank | x | -| df.transform(…) | a 1 1 | a 1 1 | a 1 | -| | b 2 2 | b 2 2 | b 2 | +| l.apply(…) | | x y | | +| l.agg(…) | x y | rank rank | x | +| l.transform(…) | a 1.0 1.0 | a 1.0 1.0 | a 1.0 | +| | b 2.0 2.0 | b 2.0 2.0 | b 2.0 | +-----------------+-------------+-------------+---------------+ ``` -* **Use `'<DF>[col_key_1, col_key_2][row_key]'` to get the fifth result's values.** +* **All methods operate on columns by default. Pass `'axis=1'` to process the rows instead.** +* **Fifth result's columns are indexed with a multi-index. This means we need a tuple of column keys to specify a single column: `'<DF>.loc[row_k, (col_k_1, col_k_2)]'`.** -#### DataFrame — Encode, Decode: +#### DataFrame — Multi-Index: +```python +<DF> = <DF>.xs(row_key, level=<int>) # Rows with key on passed level of multi-index. +<DF> = <DF>.xs(row_keys, level=<ints>) # Rows that have first key on first level, etc. 
+<DF> = <DF>.set_index(col_keys) # Combines multiple columns into a multi-index. +<S/DF> = <DF>.stack/unstack(level=-1) # Combines col keys with row keys or vice versa. +<DF> = <DF>.pivot_table(index=col_key/s, …) # `columns=col_key/s, values=col_key/s`. +``` +#### DataFrame — Encode, Decode: ```python <DF> = pd.read_json/html('<str/path/url>') # Run `$ pip3 install beautifulsoup4 lxml`. <DF> = pd.read_csv('<path/url>') # `header/index_col/dtype/usecols/…=<obj>`. @@ -3367,53 +3366,49 @@ b 3 4 <DF>.to_sql('<table_name>', <connection>) # Also `if_exists='fail/replace/append'`. ``` * **Read\_csv() only parses dates of columns that were specified by 'parse\_dates' argument. It automatically tries to detect the format, but it can be helped with 'date\_format' or 'datefirst' arguments. Both dates and datetimes get stored as pd.Timestamp objects.** -* **If there's a single invalid date then it returns the whole column as a series of strings, unlike `'<Sr> = pd.to_datetime(<Sr>, errors="coerce")'`, which uses pd.NaT.** -* **To get specific attributes from a series of Timestamps use `'<Sr>.dt.year/date/…'`.** +* **If there's a single invalid date then it returns the whole column as a series of strings, unlike `'<S> = pd.to_datetime(<S>, errors="coerce")'`, which uses pd.NaT.** +* **To get specific attributes from a series of Timestamps use `'<S>.dt.year/date/…'`.** ### GroupBy **Object that groups together rows of a dataframe based on the value of the passed column.** -```python ->>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 6]], list('abc'), list('xyz')) ->>> gb = df.groupby('z'); gb.apply(print) - x y z -a 1 2 3 - x y z -b 4 5 6 -c 7 8 6 -``` - ```python <GB> = <DF>.groupby(col_key/s) # Splits DF into groups based on passed column. -<DF> = <GB>.apply(<func>) # Maps each group. Func can return DF, Sr or el. +<DF> = <GB>.apply(<func>) # Maps each group. Func can return DF, S or el. <DF> = <GB>.get_group(<el>) # Selects a group by grouping column's value. 
-<Sr> = <GB>.size() # A Sr of group sizes. Same keys as get_group(). -<GB> = <GB>[col_key] # Single column GB. All operations return a Sr. +<S> = <GB>.size() # S of group sizes. Same keys as get_group(). +<GB> = <GB>[col_key] # Single column GB. All operations return S. ``` -#### GroupBy — Aggregate, Transform, Map: ```python -<DF> = <GB>.sum/max/mean/idxmax/all() # Or: <GB>.agg(lambda <Sr>: <el>) -<DF> = <GB>.rank/diff/cumsum/ffill() # Or: <GB>.transform(lambda <Sr>: <Sr>) -<DF> = <GB>.fillna(<el>) # Or: <GB>.transform(lambda <Sr>: <Sr>) +<DF> = <GB>.sum/max/mean/idxmax/all() # Or: <GB>.agg(lambda <S>: <el>) +<DF> = <GB>.rank/diff/cumsum/ffill() # Or: <GB>.transform(lambda <S>: <S>) +<DF> = <GB>.fillna(<el>) # Or: <GB>.transform(lambda <S>: <S>) ``` +#### Divides rows into groups and sums their columns. Result has a named index that creates column `'z'` on reset_index(): ```python +>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 6]], list('abc'), list('xyz')) +>>> gb = df.groupby('z'); gb.apply(print) + x y z +a 1 2 3 + x y z +b 4 5 6 +c 7 8 6 >>> gb.sum() x y z 3 1 2 6 11 13 ``` -* **Result has a named index that creates column `'z'` instead of `'index'` on reset_index().** ### Rolling **Object for rolling window calculations.** ```python -<RSr/RDF/RGB> = <Sr/DF/GB>.rolling(win_size) # Also: `min_periods=None, center=False`. -<RSr/RDF/RGB> = <RDF/RGB>[col_key/s] # Or: <RDF/RGB>.col_key -<Sr/DF> = <R>.mean/sum/max() # Or: <R>.apply/agg(<agg_func/str>) +<RS/RDF/RGB> = <S/DF/GB>.rolling(win_size) # Also: `min_periods=None, center=False`. 
+<RS/RDF/RGB> = <RDF/RGB>[col_key/s] # Or: <RDF/RGB>.col_key +<S/DF> = <R>.mean/sum/max() # Or: <R>.apply/agg(<agg_func/str>) ``` diff --git a/index.html b/index.html index a297efe..a584141 100644 --- a/index.html +++ b/index.html @@ -55,7 +55,7 @@ <body> <header> - <aside>October 28, 2024</aside> + <aside>November 7, 2024</aside> <a href="https://gto76.github.io" rel="author">Jure Šorn</a> </header> @@ -1452,9 +1452,9 @@ shutil.rmtree(<path>) <span class="hljs-comment"># Deletes t <li><strong>Paths can be either strings, Paths, or DirEntry objects.</strong></li> <li><strong>Functions report OS related errors by raising either OSError or one of its <a href="#exceptions-1">subclasses</a>.</strong></li> </ul> -<div><h3 id="shellcommands">Shell Commands</h3><pre><code class="python language-python hljs"><pipe> = os.popen(<span class="hljs-string">'<command>'</span>) <span class="hljs-comment"># Executes command in sh/cmd. Returns its stdout pipe.</span> +<div><h3 id="shellcommands">Shell Commands</h3><pre><code class="python language-python hljs"><pipe> = os.popen(<span class="hljs-string">'<commands>'</span>) <span class="hljs-comment"># Executes commands in sh/cmd. Returns combined stdout.</span> <str> = <pipe>.read(size=<span class="hljs-number">-1</span>) <span class="hljs-comment"># Reads 'size' chars or until EOF. Also readline/s().</span> -<int> = <pipe>.close() <span class="hljs-comment"># Closes the pipe. 
Returns None on success (returncode 0).</span> +<int> = <pipe>.close() <span class="hljs-comment"># Returns None if last command exited with returncode 0.</span> </code></pre></div> <div><h4 id="sends11tothebasiccalculatorandcapturesitsoutput">Sends '1 + 1' to the basic calculator and captures its output:</h4><pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>subprocess.run(<span class="hljs-string">'bc'</span>, input=<span class="hljs-string">'1 + 1\n'</span>, capture_output=<span class="hljs-keyword">True</span>, text=<span class="hljs-keyword">True</span>) @@ -2568,68 +2568,68 @@ W, H, MAX_S = <span class="hljs-number">50</span>, <span class="hljs-number">50< main() </code></pre></div> -<div><h2 id="pandas"><a href="#pandas" name="pandas">#</a>Pandas</h2><pre><code class="python language-python hljs"><span class="hljs-comment"># $ pip3 install pandas matplotlib</span> +<div><h2 id="pandas"><a href="#pandas" name="pandas">#</a>Pandas</h2><p><strong>Data analysis library. 
For examples see <a href="#displaysalinechartoftotalcoronavirusdeathspermilliongroupedbycontinent">Plotly</a>.</strong></p><pre><code class="python language-python hljs"><span class="hljs-comment"># $ pip3 install pandas matplotlib</span> <span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd, matplotlib.pyplot <span class="hljs-keyword">as</span> plt </code></pre></div> -<div><h3 id="series">Series</h3><p><strong>Ordered dictionary with a name.</strong></p><pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>sr = pd.Series([<span class="hljs-number">1</span>, <span class="hljs-number">2</span>], index=[<span class="hljs-string">'x'</span>, <span class="hljs-string">'y'</span>], name=<span class="hljs-string">'a'</span>); sr + +<div><h3 id="series">Series</h3><p><strong>Ordered dictionary with a name.</strong></p><pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>s = pd.Series([<span class="hljs-number">1</span>, <span class="hljs-number">2</span>], index=[<span class="hljs-string">'x'</span>, <span class="hljs-string">'y'</span>], name=<span class="hljs-string">'a'</span>); s x <span class="hljs-number">1</span> y <span class="hljs-number">2</span> Name: a, dtype: int64 </code></pre></div> -<pre><code class="python language-python hljs"><Sr> = pd.Series(<list>) <span class="hljs-comment"># Assigns RangeIndex starting at 0.</span> -<Sr> = pd.Series(<dict>) <span class="hljs-comment"># Takes dictionary's keys for index.</span> -<Sr> = pd.Series(<dict/Series>, index=<list>) <span class="hljs-comment"># Only keeps items with keys specified in index.</span> +<pre><code class="python language-python hljs"><S> = pd.Series(<list>) <span class="hljs-comment"># Assigns RangeIndex starting at 0.</span> +<S> = pd.Series(<dict>) <span class="hljs-comment"># Takes dictionary's keys for index.</span> +<S> = pd.Series(<dict/Series>, index=<list>) <span class="hljs-comment"># Only keeps 
items with keys specified in index.</span> </code></pre> -<pre><code class="python language-python hljs"><el> = <Sr>.loc[key] <span class="hljs-comment"># Or: <Sr>.iloc[i]</span> -<Sr> = <Sr>.loc[coll_of_keys] <span class="hljs-comment"># Or: <Sr>.iloc[coll_of_i]</span> -<Sr> = <Sr>.loc[from_key : to_key_inc] <span class="hljs-comment"># Or: <Sr>.iloc[from_i : to_i_exc]</span> +<pre><code class="python language-python hljs"><el> = <S>.loc[key] <span class="hljs-comment"># Or: <S>.iloc[i]</span> +<S> = <S>.loc[coll_of_keys] <span class="hljs-comment"># Or: <S>.iloc[coll_of_i]</span> +<S> = <S>.loc[from_key : to_key_inc] <span class="hljs-comment"># Or: <S>.iloc[from_i : to_i_exc]</span> </code></pre> -<pre><code class="python language-python hljs"><el> = <Sr>[key/i] <span class="hljs-comment"># Or: <Sr>.<key></span> -<Sr> = <Sr>[coll_of_keys/coll_of_i] <span class="hljs-comment"># Or: <Sr>[key/i : key/i]</span> -<Sr> = <Sr>[bools] <span class="hljs-comment"># Or: <Sr>.loc/iloc[bools]</span> +<pre><code class="python language-python hljs"><el> = <S>[key/i] <span class="hljs-comment"># Or: <S>.<key></span> +<S> = <S>[coll_of_keys/coll_of_i] <span class="hljs-comment"># Or: <S>[key/i : key/i]</span> +<S> = <S>[bools] <span class="hljs-comment"># Or: <S>.loc/iloc[bools]</span> </code></pre> -<pre><code class="python language-python hljs"><Sr> = <Sr> > <el/Sr> <span class="hljs-comment"># Returns a Series of bools.</span> -<Sr> = <Sr> + <el/Sr> <span class="hljs-comment"># Items with non-matching keys get value NaN.</span> +<pre><code class="python language-python hljs"><S> = <S> > <el/S> <span class="hljs-comment"># Returns a Series of bools.</span> +<S> = <S> + <el/S> <span class="hljs-comment"># Items with non-matching keys get value NaN.</span> </code></pre> -<pre><code class="python language-python hljs"><Sr> = pd.concat(<coll_of_Sr>) <span class="hljs-comment"># Concats multiple series into one long Series.</span> -<Sr> = <Sr>.combine_first(<Sr>) <span 
class="hljs-comment"># Adds items that are not yet present.</span> -<Sr>.update(<Sr>) <span class="hljs-comment"># Updates items that are already present.</span> +<pre><code class="python language-python hljs"><S> = pd.concat(<coll_of_S>) <span class="hljs-comment"># Concats multiple series into one long Series.</span> +<S> = <S>.combine_first(<S>) <span class="hljs-comment"># Adds items that are not yet present.</span> +<S>.update(<S>) <span class="hljs-comment"># Updates items that are already present.</span> </code></pre> -<pre><code class="python language-python hljs"><Sr>.plot.line/area/bar/pie/hist() <span class="hljs-comment"># Generates a Matplotlib plot.</span> +<pre><code class="python language-python hljs"><S>.plot.line/area/bar/pie/hist() <span class="hljs-comment"># Generates a Matplotlib plot.</span> plt.show() <span class="hljs-comment"># Displays the plot. Also plt.savefig(<path>).</span> </code></pre> -<div><h4 id="seriesaggregatetransformmap">Series — Aggregate, Transform, Map:</h4><pre><code class="python language-python hljs"><el> = <Sr>.sum/max/mean/idxmax/all() <span class="hljs-comment"># Or: <Sr>.agg(lambda <Sr>: <el>)</span> -<Sr> = <Sr>.rank/diff/cumsum/ffill/interpo…() <span class="hljs-comment"># Or: <Sr>.agg/transform(lambda <Sr>: <Sr>)</span> -<Sr> = <Sr>.fillna(<el>) <span class="hljs-comment"># Or: <Sr>.agg/transform/map(lambda <el>: <el>)</span> +<ul> +<li><strong>Indexing objects can't be tuples because <code class="python hljs"><span class="hljs-string">'obj[x, y]'</span></code> is converted to <code class="python hljs"><span class="hljs-string">'obj[(x, y)]'</span></code>!</strong></li> +<li><strong>Pandas uses NumPy types like <code class="python hljs"><span class="hljs-string">'np.int64'</span></code>. Series is converted to <code class="python hljs"><span class="hljs-string">'float64'</span></code> if we assign np.nan to any item. 
Use <code class="python hljs"><span class="hljs-string">'<S>.astype(<str/type>)'</span></code> to get converted Series.</strong></li> +</ul> +<div><h4 id="seriesaggregatetransformmap">Series — Aggregate, Transform, Map:</h4><pre><code class="python language-python hljs"><el> = <S>.sum/max/mean/idxmax/all() <span class="hljs-comment"># Or: <S>.agg(lambda <S>: <el>)</span> +<S> = <S>.rank/diff/cumsum/ffill/interpol…() <span class="hljs-comment"># Or: <S>.agg/transform(lambda <S>: <S>)</span> +<S> = <S>.isna/fillna/isin([<el/coll>]) <span class="hljs-comment"># Or: <S>.agg/transform/map(lambda <el>: <el>)</span> </code></pre></div> -<pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>sr = pd.Series([<span class="hljs-number">2</span>, <span class="hljs-number">3</span>], index=[<span class="hljs-string">'x'</span>, <span class="hljs-string">'y'</span>]); sr -x <span class="hljs-number">2</span> -y <span class="hljs-number">3</span> -</code></pre> <pre><code class="python hljs">┏━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓ ┃ │ <span class="hljs-string">'sum'</span> │ [<span class="hljs-string">'sum'</span>] │ {<span class="hljs-string">'s'</span>: <span class="hljs-string">'sum'</span>} ┃ ┠───────────────┼─────────────┼─────────────┼───────────────┨ -┃ sr.apply(…) │ <span class="hljs-number">5</span> │ sum <span class="hljs-number">5</span> │ s <span class="hljs-number">5</span> ┃ -┃ sr.agg(…) │ │ │ ┃ +┃ s.apply(…) │ <span class="hljs-number">3</span> │ sum <span class="hljs-number">3</span> │ s <span class="hljs-number">3</span> ┃ +┃ s.agg(…) │ │ │ ┃ ┗━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛ ┏━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓ ┃ │ <span class="hljs-string">'rank'</span> │ [<span class="hljs-string">'rank'</span>] │ {<span class="hljs-string">'r'</span>: <span class="hljs-string">'rank'</span>} ┃ ┠───────────────┼─────────────┼─────────────┼───────────────┨ -┃ sr.apply(…) │ │ 
rank │ ┃ -┃ sr.agg(…) │ x <span class="hljs-number">1</span> │ x <span class="hljs-number">1</span> │ r x <span class="hljs-number">1</span> ┃ -┃ │ y <span class="hljs-number">2</span> │ y <span class="hljs-number">2</span> │ y <span class="hljs-number">2</span> ┃ +┃ s.apply(…) │ │ rank │ ┃ +┃ s.agg(…) │ x <span class="hljs-number">1.0</span> │ x <span class="hljs-number">1.0</span> │ r x <span class="hljs-number">1.0</span> ┃ +┃ │ y <span class="hljs-number">2.0</span> │ y <span class="hljs-number">2.0</span> │ y <span class="hljs-number">2.0</span> ┃ ┗━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛ </code></pre> <ul> -<li><strong>Indexing objects can't be tuples because <code class="python hljs"><span class="hljs-string">'obj[x, y]'</span></code> is converted to <code class="python hljs"><span class="hljs-string">'obj[(x, y)]'</span></code>!</strong></li> <li><strong>Methods ffill(), interpolate(), fillna() and dropna() accept <code class="python hljs"><span class="hljs-string">'inplace=True'</span></code>.</strong></li> -<li><strong>Last result has a hierarchical index. Use <code class="python hljs"><span class="hljs-string">'<Sr>[key_1, key_2]'</span></code> to get its values.</strong></li> +<li><strong>Last result has a multi-index. 
Use <code class="python hljs"><span class="hljs-string">'<S>[key_1, key_2]'</span></code> to get its values.</strong></li> </ul> <div><h3 id="dataframe">DataFrame</h3><p><strong>Table with labeled rows and columns.</strong></p><pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>l = pd.DataFrame([[<span class="hljs-number">1</span>, <span class="hljs-number">2</span>], [<span class="hljs-number">3</span>, <span class="hljs-number">4</span>]], index=[<span class="hljs-string">'a'</span>, <span class="hljs-string">'b'</span>], columns=[<span class="hljs-string">'x'</span>, <span class="hljs-string">'y'</span>]); l x y @@ -2638,25 +2638,29 @@ b <span class="hljs-number">3</span> <span class="hljs-number">4</span> </code></pre></div> -<pre><code class="python language-python hljs"><DF> = pd.DataFrame(<list_of_rows>) <span class="hljs-comment"># Rows can be either lists, dicts or series.</span> -<DF> = pd.DataFrame(<dict_of_columns>) <span class="hljs-comment"># Columns can be either lists, dicts or series.</span> +<pre><code class="python language-python hljs"><DF> = pd.DataFrame(<list_of_rows>) <span class="hljs-comment"># Rows can be either lists, dicts or series.</span> +<DF> = pd.DataFrame(<dict_of_columns>) <span class="hljs-comment"># Columns can be either lists, dicts or series.</span> </code></pre> -<pre><code class="python language-python hljs"><el> = <DF>.loc[row_key, col_key] <span class="hljs-comment"># Or: <DF>.iloc[row_i, col_i]</span> -<Sr/DF> = <DF>.loc[row_key/s] <span class="hljs-comment"># Or: <DF>.iloc[row_i/s]</span> -<Sr/DF> = <DF>.loc[:, col_key/s] <span class="hljs-comment"># Or: <DF>.iloc[:, col_i/s]</span> -<DF> = <DF>.loc[row_bools, col_bools] <span class="hljs-comment"># Or: <DF>.iloc[row_bools, col_bools]</span> +<pre><code class="python language-python hljs"><el> = <DF>.loc[row_key, col_key] <span class="hljs-comment"># Or: <DF>.iloc[row_i, col_i]</span> +<S/DF> = <DF>.loc[row_key/s] <span 
class="hljs-comment"># Or: <DF>.iloc[row_i/s]</span> +<S/DF> = <DF>.loc[:, col_key/s] <span class="hljs-comment"># Or: <DF>.iloc[:, col_i/s]</span> +<DF> = <DF>.loc[row_bools, col_bools] <span class="hljs-comment"># Or: <DF>.iloc[row_bools, col_bools]</span> </code></pre> -<pre><code class="python language-python hljs"><Sr/DF> = <DF>[col_key/s] <span class="hljs-comment"># Or: <DF>.<col_key></span> -<DF> = <DF>[row_bools] <span class="hljs-comment"># Keeps rows as specified by bools.</span> -<DF> = <DF>[<DF_of_bools>] <span class="hljs-comment"># Assigns NaN to items that are False in bools.</span> +<pre><code class="python language-python hljs"><S/DF> = <DF>[col_key/s] <span class="hljs-comment"># Or: <DF>.<col_key></span> +<DF> = <DF>[row_bools] <span class="hljs-comment"># Keeps rows as specified by bools.</span> +<DF> = <DF>[<DF_of_bools>] <span class="hljs-comment"># Assigns NaN to items that are False in bools.</span> </code></pre> -<pre><code class="python language-python hljs"><DF> = <DF> > <el/Sr/DF> <span class="hljs-comment"># Returns DF of bools. Sr is treated as a row.</span> -<DF> = <DF> + <el/Sr/DF> <span class="hljs-comment"># Items with non-matching keys get value NaN.</span> +<pre><code class="python language-python hljs"><DF> = <DF> > <el/S/DF> <span class="hljs-comment"># Returns DF of bools. S is treated as a row.</span> +<DF> = <DF> + <el/S/DF> <span class="hljs-comment"># Items with non-matching keys get value NaN.</span> </code></pre> -<pre><code class="python language-python hljs"><DF> = <DF>.set_index(col_key) <span class="hljs-comment"># Replaces row keys with column's values.</span> -<DF> = <DF>.reset_index(drop=<span class="hljs-keyword">False</span>) <span class="hljs-comment"># Drops or moves row keys to column named index.</span> -<DF> = <DF>.sort_index(ascending=<span class="hljs-keyword">True</span>) <span class="hljs-comment"># Sorts rows by row keys. 
Use `axis=1` for cols.</span> -<DF> = <DF>.sort_values(col_key/s) <span class="hljs-comment"># Sorts rows by passed column/s. Also `axis=1`.</span> +<pre><code class="python language-python hljs"><DF> = <DF>.set_index(col_key) <span class="hljs-comment"># Replaces row keys with column's values.</span> +<DF> = <DF>.reset_index(drop=<span class="hljs-keyword">False</span>) <span class="hljs-comment"># Drops or moves row keys to column named index.</span> +<DF> = <DF>.sort_index(ascending=<span class="hljs-keyword">True</span>) <span class="hljs-comment"># Sorts rows by row keys. Use `axis=1` for cols.</span> +<DF> = <DF>.sort_values(col_key/s) <span class="hljs-comment"># Sorts rows by passed column/s. Also `axis=1`.</span> +</code></pre> +<pre><code class="python language-python hljs"><DF> = <DF>.head/tail/sample(<int>) <span class="hljs-comment"># Returns first, last, or random n elements.</span> +<DF> = <DF>.describe() <span class="hljs-comment"># Describes columns. Also shape, info(), corr().</span> +<DF> = <DF>.query(<span class="hljs-string">'<query>'</span>) <span class="hljs-comment"># Filters rows with e.g. 'col_1 == val_1 and …'.</span> </code></pre> <pre><code class="python language-python hljs"><DF>.plot.line/area/bar/scatter(x=col_key, …) <span class="hljs-comment"># `y=col_key/s`. Also hist/box(by=col_key).</span> plt.show() <span class="hljs-comment"># Displays the plot. Also plt.savefig(<path>).</span> @@ -2684,52 +2688,47 @@ c <span class="hljs-number">6</span> <span class="hljs-number">7</span> ┃ axis=<span class="hljs-number">0</span>, │ a <span class="hljs-number">1</span> <span class="hljs-number">2</span> . │ <span class="hljs-number">2</span> │ │ Uses <span class="hljs-string">'outer'</span> by default. ┃ ┃ join=…) │ b <span class="hljs-number">3</span> <span class="hljs-number">4</span> . │ <span class="hljs-number">4</span> │ │ A Series is treated as a ┃ ┃ │ b . 
<span class="hljs-number">4</span> <span class="hljs-number">5</span> │ <span class="hljs-number">4</span> │ │ column. To add a row use ┃ -┃ │ c . <span class="hljs-number">6</span> <span class="hljs-number">7</span> │ <span class="hljs-number">6</span> │ │ pd.concat([l, DF([sr])]).┃ +┃ │ c . <span class="hljs-number">6</span> <span class="hljs-number">7</span> │ <span class="hljs-number">6</span> │ │ pd.concat([l, DF([s])]). ┃ ┠────────────────────────┼───────────────┼────────────┼────────────┼──────────────────────────┨ ┃ pd.concat([l, r], │ x y y z │ │ │ Adds columns at the ┃ ┃ axis=<span class="hljs-number">1</span>, │ a <span class="hljs-number">1</span> <span class="hljs-number">2</span> . . │ x y y z │ │ right end. Uses <span class="hljs-string">'outer'</span> ┃ ┃ join=…) │ b <span class="hljs-number">3</span> <span class="hljs-number">4</span> <span class="hljs-number">4</span> <span class="hljs-number">5</span> │ <span class="hljs-number">3</span> <span class="hljs-number">4</span> <span class="hljs-number">4</span> <span class="hljs-number">5</span> │ │ by default. A Series is ┃ ┃ │ c . . <span class="hljs-number">6</span> <span class="hljs-number">7</span> │ │ │ treated as a column. ┃ -┠────────────────────────┼───────────────┼────────────┼────────────┼──────────────────────────┨ -┃ l.combine_first(r) │ x y z │ │ │ Adds missing rows and ┃ -┃ │ a <span class="hljs-number">1</span> <span class="hljs-number">2</span> . │ │ │ columns. Also updates ┃ -┃ │ b <span class="hljs-number">3</span> <span class="hljs-number">4</span> <span class="hljs-number">5</span> │ │ │ items that contain NaN. ┃ -┃ │ c . <span class="hljs-number">6</span> <span class="hljs-number">7</span> │ │ │ Argument r must be a DF. 
┃ ┗━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┷━━━━━━━━━━━━┷━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━┛ </code></pre> -<div><h4 id="dataframeaggregatetransformmap">DataFrame — Aggregate, Transform, Map:</h4><pre><code class="python language-python hljs"><Sr> = <DF>.sum/max/mean/idxmax/all() <span class="hljs-comment"># Or: <DF>.apply/agg(lambda <Sr>: <el>)</span> -<DF> = <DF>.rank/diff/cumsum/ffill/interpo…() <span class="hljs-comment"># Or: <DF>.apply/agg/transfo…(lambda <Sr>: <Sr>)</span> -<DF> = <DF>.fillna(<el>) <span class="hljs-comment"># Or: <DF>.applymap(lambda <el>: <el>)</span> +<div><h4 id="dataframeaggregatetransformmap">DataFrame — Aggregate, Transform, Map:</h4><pre><code class="python language-python hljs"><S> = <DF>.sum/max/mean/idxmax/all() <span class="hljs-comment"># Or: <DF>.apply/agg(lambda <S>: <el>)</span> +<DF> = <DF>.rank/diff/cumsum/ffill/interpo…() <span class="hljs-comment"># Or: <DF>.apply/agg/transform(lambda <S>: <S>)</span> +<DF> = <DF>.isna/fillna/isin([<el/coll>]) <span class="hljs-comment"># Or: <S>.agg/transform/map(lambda <el>: <el>)</span> </code></pre></div> -<ul> -<li><strong>All operations operate on columns by default. 
Pass <code class="python hljs"><span class="hljs-string">'axis=1'</span></code> to process the rows instead.</strong></li> -</ul> -<pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>df = pd.DataFrame([[<span class="hljs-number">1</span>, <span class="hljs-number">2</span>], [<span class="hljs-number">3</span>, <span class="hljs-number">4</span>]], index=[<span class="hljs-string">'a'</span>, <span class="hljs-string">'b'</span>], columns=[<span class="hljs-string">'x'</span>, <span class="hljs-string">'y'</span>]); df - x y -a <span class="hljs-number">1</span> <span class="hljs-number">2</span> -b <span class="hljs-number">3</span> <span class="hljs-number">4</span> -</code></pre> <pre><code class="python hljs">┏━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓ ┃ │ <span class="hljs-string">'sum'</span> │ [<span class="hljs-string">'sum'</span>] │ {<span class="hljs-string">'x'</span>: <span class="hljs-string">'sum'</span>} ┃ ┠─────────────────┼─────────────┼─────────────┼───────────────┨ -┃ df.apply(…) │ x <span class="hljs-number">4</span> │ x y │ x <span class="hljs-number">4</span> ┃ -┃ df.agg(…) │ y <span class="hljs-number">6</span> │ sum <span class="hljs-number">4</span> <span class="hljs-number">6</span> │ ┃ +┃ l.apply(…) │ x <span class="hljs-number">4</span> │ x y │ x <span class="hljs-number">4</span> ┃ +┃ l.agg(…) │ y <span class="hljs-number">6</span> │ sum <span class="hljs-number">4</span> <span class="hljs-number">6</span> │ ┃ ┗━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛ ┏━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓ ┃ │ <span class="hljs-string">'rank'</span> │ [<span class="hljs-string">'rank'</span>] │ {<span class="hljs-string">'x'</span>: <span class="hljs-string">'rank'</span>} ┃ ┠─────────────────┼─────────────┼─────────────┼───────────────┨ -┃ df.apply(…) │ │ x y │ ┃ -┃ df.agg(…) │ x y │ rank rank │ x ┃ -┃ df.transform(…) │ a <span class="hljs-number">1</span> 
<span class="hljs-number">1</span> │ a <span class="hljs-number">1</span> <span class="hljs-number">1</span> │ a <span class="hljs-number">1</span> ┃ -┃ │ b <span class="hljs-number">2</span> <span class="hljs-number">2</span> │ b <span class="hljs-number">2</span> <span class="hljs-number">2</span> │ b <span class="hljs-number">2</span> ┃ +┃ l.apply(…) │ │ x y │ ┃ +┃ l.agg(…) │ x y │ rank rank │ x ┃ +┃ l.transform(…) │ a <span class="hljs-number">1.0</span> <span class="hljs-number">1.0</span> │ a <span class="hljs-number">1.0</span> <span class="hljs-number">1.0</span> │ a <span class="hljs-number">1.0</span> ┃ +┃ │ b <span class="hljs-number">2.0</span> <span class="hljs-number">2.0</span> │ b <span class="hljs-number">2.0</span> <span class="hljs-number">2.0</span> │ b <span class="hljs-number">2.0</span> ┃ ┗━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛ </code></pre> <ul> -<li><strong>Use <code class="python hljs"><span class="hljs-string">'<DF>[col_key_1, col_key_2][row_key]'</span></code> to get the fifth result's values.</strong></li> +<li><strong>All methods operate on columns by default. Pass <code class="python hljs"><span class="hljs-string">'axis=1'</span></code> to process the rows instead.</strong></li> +<li><strong>Fifth result's columns are indexed with a multi-index. 
This means we need a tuple of column keys to specify a single column: <code class="python hljs"><span class="hljs-string">'<DF>.loc[row_k, (col_k_1, col_k_2)]'</span></code>.</strong></li> </ul> +<div><h4 id="dataframemultiindex">DataFrame — Multi-Index:</h4><pre><code class="python language-python hljs"><DF> = <DF>.xs(row_key, level=<int>) <span class="hljs-comment"># Rows with key on passed level of multi-index.</span> +<DF> = <DF>.xs(row_keys, level=<ints>) <span class="hljs-comment"># Rows that have first key on first level, etc.</span> +<DF> = <DF>.set_index(col_keys) <span class="hljs-comment"># Combines multiple columns into a multi-index.</span> +<S/DF> = <DF>.stack/unstack(level=<span class="hljs-number">-1</span>) <span class="hljs-comment"># Combines col keys with row keys or vice versa.</span> +<DF> = <DF>.pivot_table(index=col_key/s, …) <span class="hljs-comment"># `columns=col_key/s, values=col_key/s`.</span> +</code></pre></div> + <div><h4 id="dataframeencodedecode">DataFrame — Encode, Decode:</h4><pre><code class="python language-python hljs"><DF> = pd.read_json/html(<span class="hljs-string">'<str/path/url>'</span>) <span class="hljs-comment"># Run `$ pip3 install beautifulsoup4 lxml`.</span> <DF> = pd.read_csv(<span class="hljs-string">'<path/url>'</span>) <span class="hljs-comment"># `header/index_col/dtype/usecols/…=<obj>`.</span> <DF> = pd.read_pickle/excel(<span class="hljs-string">'<path/url>'</span>) <span class="hljs-comment"># Use `sheet_name=None` to get all Excel sheets.</span> @@ -2743,41 +2742,37 @@ b <span class="hljs-number">3</span> <span class="hljs-number">4</span> </code></pre> <ul> <li><strong>Read_csv() only parses dates of columns that were specified by 'parse_dates' argument. It automatically tries to detect the format, but it can be helped with 'date_format' or 'dayfirst' arguments.
Both dates and datetimes get stored as pd.Timestamp objects.</strong></li> -<li><strong>If there's a single invalid date then it returns the whole column as a series of strings, unlike <code class="python hljs"><span class="hljs-string">'<Sr> = pd.to_datetime(<Sr>, errors="coerce")'</span></code>, which uses pd.NaT.</strong></li> -<li><strong>To get specific attributes from a series of Timestamps use <code class="python hljs"><span class="hljs-string">'<Sr>.dt.year/date/…'</span></code>.</strong></li> +<li><strong>If there's a single invalid date then it returns the whole column as a series of strings, unlike <code class="python hljs"><span class="hljs-string">'<S> = pd.to_datetime(<S>, errors="coerce")'</span></code>, which uses pd.NaT.</strong></li> +<li><strong>To get specific attributes from a series of Timestamps use <code class="python hljs"><span class="hljs-string">'<S>.dt.year/date/…'</span></code>.</strong></li> </ul> -<div><h3 id="groupby">GroupBy</h3><p><strong>Object that groups together rows of a dataframe based on the value of the passed column.</strong></p><pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>df = pd.DataFrame([[<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>], [<span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>], [<span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">6</span>]], list(<span class="hljs-string">'abc'</span>), list(<span class="hljs-string">'xyz'</span>)) +<div><h3 id="groupby">GroupBy</h3><p><strong>Object that groups together rows of a dataframe based on the value of the passed column.</strong></p><pre><code class="python language-python hljs"><GB> = <DF>.groupby(col_key/s) <span class="hljs-comment"># Splits DF into groups based on passed column.</span> +<DF> = <GB>.apply(<func>) <span class="hljs-comment"># Maps each group. 
Func can return DF, S or el.</span> +<DF> = <GB>.get_group(<el>) <span class="hljs-comment"># Selects a group by grouping column's value.</span> +<S> = <GB>.size() <span class="hljs-comment"># S of group sizes. Same keys as get_group().</span> +<GB> = <GB>[col_key] <span class="hljs-comment"># Single column GB. All operations return S.</span> +</code></pre></div> + + +<pre><code class="python language-python hljs"><DF> = <GB>.sum/max/mean/idxmax/all() <span class="hljs-comment"># Or: <GB>.agg(lambda <S>: <el>)</span> +<DF> = <GB>.rank/diff/cumsum/ffill() <span class="hljs-comment"># Or: <GB>.transform(lambda <S>: <S>)</span> +<DF> = <GB>.fillna(<el>) <span class="hljs-comment"># Or: <GB>.transform(lambda <S>: <S>)</span> +</code></pre> +<div><h4 id="dividesrowsintogroupsandsumstheircolumnsresulthasanamedindexthatcreatescolumnzonreset_index">Divides rows into groups and sums their columns. Result has a named index that creates column <code class="python hljs"><span class="hljs-string">'z'</span></code> on reset_index():</h4><pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>df = pd.DataFrame([[<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>], [<span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>], [<span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">6</span>]], list(<span class="hljs-string">'abc'</span>), list(<span class="hljs-string">'xyz'</span>)) <span class="hljs-meta">>>> </span>gb = df.groupby(<span class="hljs-string">'z'</span>); gb.apply(print) x y z a <span class="hljs-number">1</span> <span class="hljs-number">2</span> <span class="hljs-number">3</span> x y z b <span class="hljs-number">4</span> <span class="hljs-number">5</span> <span class="hljs-number">6</span> -c <span class="hljs-number">7</span> <span class="hljs-number">8</span> <span 
class="hljs-number">6</span></code></pre></div> - - -<pre><code class="python language-python hljs"><GB> = <DF>.groupby(col_key/s) <span class="hljs-comment"># Splits DF into groups based on passed column.</span> -<DF> = <GB>.apply(<func>) <span class="hljs-comment"># Maps each group. Func can return DF, Sr or el.</span> -<DF> = <GB>.get_group(<el>) <span class="hljs-comment"># Selects a group by grouping column's value.</span> -<Sr> = <GB>.size() <span class="hljs-comment"># A Sr of group sizes. Same keys as get_group().</span> -<GB> = <GB>[col_key] <span class="hljs-comment"># Single column GB. All operations return a Sr.</span> -</code></pre> -<div><h4 id="groupbyaggregatetransformmap">GroupBy — Aggregate, Transform, Map:</h4><pre><code class="python language-python hljs"><DF> = <GB>.sum/max/mean/idxmax/all() <span class="hljs-comment"># Or: <GB>.agg(lambda <Sr>: <el>)</span> -<DF> = <GB>.rank/diff/cumsum/ffill() <span class="hljs-comment"># Or: <GB>.transform(lambda <Sr>: <Sr>)</span> -<DF> = <GB>.fillna(<el>) <span class="hljs-comment"># Or: <GB>.transform(lambda <Sr>: <Sr>)</span> -</code></pre></div> - -<pre><code class="python language-python hljs"><span class="hljs-meta">>>> </span>gb.sum() +c <span class="hljs-number">7</span> <span class="hljs-number">8</span> <span class="hljs-number">6</span> +<span class="hljs-meta">>>> </span>gb.sum() x y z <span class="hljs-number">3</span> <span class="hljs-number">1</span> <span class="hljs-number">2</span> -<span class="hljs-number">6</span> <span class="hljs-number">11</span> <span class="hljs-number">13</span> -</code></pre> -<ul> -<li><strong>Result has a named index that creates column <code class="python hljs"><span class="hljs-string">'z'</span></code> instead of <code class="python hljs"><span class="hljs-string">'index'</span></code> on reset_index().</strong></li> -</ul> -<div><h3 id="rolling">Rolling</h3><p><strong>Object for rolling window calculations.</strong></p><pre><code class="python 
language-python hljs"><RSr/RDF/RGB> = <Sr/DF/GB>.rolling(win_size) <span class="hljs-comment"># Also: `min_periods=None, center=False`.</span> -<RSr/RDF/RGB> = <RDF/RGB>[col_key/s] <span class="hljs-comment"># Or: <RDF/RGB>.col_key</span> -<Sr/DF> = <R>.mean/sum/max() <span class="hljs-comment"># Or: <R>.apply/agg(<agg_func/str>)</span> +<span class="hljs-number">6</span> <span class="hljs-number">11</span> <span class="hljs-number">13</span></code></pre></div> + +<div><h3 id="rolling">Rolling</h3><p><strong>Object for rolling window calculations.</strong></p><pre><code class="python language-python hljs"><RS/RDF/RGB> = <S/DF/GB>.rolling(win_size) <span class="hljs-comment"># Also: `min_periods=None, center=False`.</span> +<RS/RDF/RGB> = <RDF/RGB>[col_key/s] <span class="hljs-comment"># Or: <RDF/RGB>.col_key</span> +<S/DF> = <R>.mean/sum/max() <span class="hljs-comment"># Or: <R>.apply/agg(<agg_func/str>)</span> </code></pre></div> @@ -2928,7 +2923,7 @@ $ deactivate <span class="hljs-comment"># Deactivates the active <footer> - <aside>October 28, 2024</aside> + <aside>November 7, 2024</aside> <a href="https://gto76.github.io" rel="author">Jure Šorn</a> </footer> diff --git a/parse.js b/parse.js index bb68554..51b7ee2 100755 --- a/parse.js +++ b/parse.js @@ -316,7 +316,13 @@ const GROUPBY = 'a <span class="hljs-number">1</span> <span class="hljs-number">2</span> <span class="hljs-number">3</span>\n' + ' x y z\n' + 'b <span class="hljs-number">4</span> <span class="hljs-number">5</span> <span class="hljs-number">6</span>\n' + - 'c <span class="hljs-number">7</span> <span class="hljs-number">8</span> <span class="hljs-number">6</span>'; + 'c <span class="hljs-number">7</span> <span class="hljs-number">8</span> <span class="hljs-number">6</span>\n' + + '<span class="hljs-meta">>>> </span>gb.sum()\n' + + ' x y\n' + + 'z\n' + + '<span class="hljs-number">3</span> <span class="hljs-number">1</span> <span class="hljs-number">2</span>\n' + + '<span 
class="hljs-number">6</span> <span class="hljs-number">11</span> <span class="hljs-number">13</span>'; + const CYTHON_1 = '<span class="hljs-keyword">cdef</span> <ctype> <var_name> = <obj>\n' + @@ -576,22 +582,22 @@ const DIAGRAM_12_B = '┗━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━┷━━━━━━━━━━━┛\n'; const DIAGRAM_13_A = - '| sr.apply(…) | 5 | sum 5 | s 5 |'; + '| s.apply(…) | 3 | sum 3 | s 3 |'; const DIAGRAM_13_B = "┏━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓\n" + "┃ │ 'sum' │ ['sum'] │ {'s': 'sum'} ┃\n" + "┠───────────────┼─────────────┼─────────────┼───────────────┨\n" + - "┃ sr.apply(…) │ 5 │ sum 5 │ s 5 ┃\n" + - "┃ sr.agg(…) │ │ │ ┃\n" + + "┃ s.apply(…) │ 3 │ sum 3 │ s 3 ┃\n" + + "┃ s.agg(…) │ │ │ ┃\n" + "┗━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛\n" + "\n" + "┏━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓\n" + "┃ │ 'rank' │ ['rank'] │ {'r': 'rank'} ┃\n" + "┠───────────────┼─────────────┼─────────────┼───────────────┨\n" + - "┃ sr.apply(…) │ │ rank │ ┃\n" + - "┃ sr.agg(…) │ x 1 │ x 1 │ r x 1 ┃\n" + - "┃ │ y 2 │ y 2 │ y 2 ┃\n" + + "┃ s.apply(…) │ │ rank │ ┃\n" + + "┃ s.agg(…) │ x 1.0 │ x 1.0 │ r x 1.0 ┃\n" + + "┃ │ y 2.0 │ y 2.0 │ y 2.0 ┃\n" + "┗━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛\n"; const DIAGRAM_14_A = @@ -618,37 +624,32 @@ const DIAGRAM_15_B = "┃ axis=0, │ a 1 2 . │ 2 │ │ Uses 'outer' by default. ┃\n" + "┃ join=…) │ b 3 4 . │ 4 │ │ A Series is treated as a ┃\n" + "┃ │ b . 4 5 │ 4 │ │ column. To add a row use ┃\n" + - "┃ │ c . 6 7 │ 6 │ │ pd.concat([l, DF([sr])]).┃\n" + + "┃ │ c . 6 7 │ 6 │ │ pd.concat([l, DF([s])]). ┃\n" + "┠────────────────────────┼───────────────┼────────────┼────────────┼──────────────────────────┨\n" + "┃ pd.concat([l, r], │ x y y z │ │ │ Adds columns at the ┃\n" + "┃ axis=1, │ a 1 2 . . │ x y y z │ │ right end. Uses 'outer' ┃\n" + "┃ join=…) │ b 3 4 4 5 │ 3 4 4 5 │ │ by default. A Series is ┃\n" + "┃ │ c . . 6 7 │ │ │ treated as a column. 
┃\n" + - "┠────────────────────────┼───────────────┼────────────┼────────────┼──────────────────────────┨\n" + - "┃ l.combine_first(r) │ x y z │ │ │ Adds missing rows and ┃\n" + - "┃ │ a 1 2 . │ │ │ columns. Also updates ┃\n" + - "┃ │ b 3 4 5 │ │ │ items that contain NaN. ┃\n" + - "┃ │ c . 6 7 │ │ │ Argument r must be a DF. ┃\n" + "┗━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┷━━━━━━━━━━━━┷━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━┛\n"; const DIAGRAM_16_A = - '| df.apply(…) | x 4 | x y | x 4 |'; + '| l.apply(…) | x 4 | x y | x 4 |'; const DIAGRAM_16_B = "┏━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓\n" + "┃ │ 'sum' │ ['sum'] │ {'x': 'sum'} ┃\n" + "┠─────────────────┼─────────────┼─────────────┼───────────────┨\n" + - "┃ df.apply(…) │ x 4 │ x y │ x 4 ┃\n" + - "┃ df.agg(…) │ y 6 │ sum 4 6 │ ┃\n" + + "┃ l.apply(…) │ x 4 │ x y │ x 4 ┃\n" + + "┃ l.agg(…) │ y 6 │ sum 4 6 │ ┃\n" + "┗━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛\n" + "\n" + "┏━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┓\n" + "┃ │ 'rank' │ ['rank'] │ {'x': 'rank'} ┃\n" + "┠─────────────────┼─────────────┼─────────────┼───────────────┨\n" + - "┃ df.apply(…) │ │ x y │ ┃\n" + - "┃ df.agg(…) │ x y │ rank rank │ x ┃\n" + - "┃ df.transform(…) │ a 1 1 │ a 1 1 │ a 1 ┃\n" + - "┃ │ b 2 2 │ b 2 2 │ b 2 ┃\n" + + "┃ l.apply(…) │ │ x y │ ┃\n" + + "┃ l.agg(…) │ x y │ rank rank │ x ┃\n" + + "┃ l.transform(…) │ a 1.0 1.0 │ a 1.0 1.0 │ a 1.0 ┃\n" + + "┃ │ b 2.0 2.0 │ b 2.0 2.0 │ b 2.0 ┃\n" + "┗━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┛\n"; const DIAGRAM_17_A =