{ "cells": [ { "cell_type": "markdown", "id": "b574dd76-d6f2-4194-8edc-f18fa5eb30b0", "metadata": {}, "source": [ "# Parte 3: Transformación de tablas" ] }, { "cell_type": "code", "execution_count": null, "id": "1f6a501e-1f1c-4968-9f82-27645694d6bf", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "pd.options.display.float_format = '{:.2f}'.format #Desactivar notación científica en pandas:\n", "np.set_printoptions(suppress=True) #Desactivar notación científica en numpy:\n", "pd.set_option('display.max_columns', None) #comando para mostrar todas las columnas" ] }, { "cell_type": "code", "execution_count": null, "id": "5a9962a3-799a-4c68-8389-0b404811ca83", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('./data/transacciones2.csv', sep=';', decimal=',')" ] }, { "cell_type": "code", "execution_count": null, "id": "fac280fb-c71c-46ee-9995-a30ff9a8c9e4", "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "9c2e2fa7-6477-4987-9c06-039d60c70103", "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "01a429ef-f03e-48a7-aaf8-3d28b72970b8", "metadata": {}, "outputs": [], "source": [ "df.Fecha = pd.to_datetime(df.Fecha, errors='coerce')" ] }, { "cell_type": "code", "execution_count": null, "id": "7f57f2e7-ee32-4fb8-935c-17a47eccbfb6", "metadata": {}, "outputs": [], "source": [ "df['mes'] = df.Fecha.dt.month" ] }, { "cell_type": "code", "execution_count": null, "id": "1946c46f-3e44-4eb9-b520-a811b4aecff2", "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "16d66a6f-97c6-46c7-a6bb-697983779cac", "metadata": {}, "outputs": [], "source": [ "# Comprobamos la existencia (o no) de registros duplicados" ] }, { "cell_type": "code", "execution_count": null, "id": "925af3d4-eda4-4cbc-809b-1e5d6d4f61d8", "metadata": {}, "outputs": [], "source": [ "filtro = df.duplicated(keep='first') # Booleano que podemos usar para filtrar los duplicados\n", "filtro" ] }, { "cell_type": "code", "execution_count": null, "id": "88be3352-5418-40f2-9d30-4e5ed9579980", "metadata": {}, "outputs": [], "source": [ "df.duplicated(keep='first').sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "e7fc98b2-2551-4257-953f-57fa249449c4", "metadata": {}, "outputs": [], "source": [ "# Podemos eliminar duplicados filtrando con filtro o a través de .drop_duplicates\n", "df2 = df.drop_duplicates(keep='first')\n", "print(df2.shape)\n", "print(df.shape)\n", "print (df.shape[0]-df2.shape[0]) # Vemos que en ambos casos, el total de registros duplicados es el mismo." ] }, { "cell_type": "code", "execution_count": null, "id": "b8b1c38c-b3c1-4056-82a9-080351819dfc", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Tipos de comercios únicos\n", "df2['Tipo de Comercio'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "75e9a611-fc7b-4c80-8d60-5ade311e558b", "metadata": {}, "outputs": [], "source": [ "# Pequeño inciso. Simplificamos el formato del tipo de comercio. Nos sobra es_\n", "df2['Tipo de Comercio'] = df2['Tipo de Comercio'].str.replace('es_','')" ] }, { "cell_type": "code", "execution_count": null, "id": "8343dd20-fc4b-4338-82a3-4fc823f720ae", "metadata": {}, "outputs": [], "source": [ "df2['Tipo de Comercio'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "4f38cc5a-6648-4e3d-9e7a-bb9e7833aa12", "metadata": {}, "outputs": [], "source": [ "# Además simplificamos los nombres de columna\n", "df2.columns = ['fecha','cp','comercio','transacciones','importe','importetotal','mes']" ] }, { "cell_type": "code", "execution_count": null, "id": "76d12578-debc-4fc5-867f-51a885aa8871", "metadata": {}, "outputs": [], "source": [ "# Aprovechamos que value_counts() nos hace el trabajo, y filtramos por los 5 primeros comercios\n", "tmp = df2['comercio'].value_counts()\n", "filtro = tmp.index [0:5].values # Los tipos de negocio son el índice del array\n", "filtro" ] }, { "cell_type": "markdown", "id": "070a3309-8a30-4048-a19b-45bb2ea070a6", "metadata": {}, "source": [ "# Probad a cambiar los nombres de estos 5 tipos de negocio al castellano, usando la función de mapeo vista anteriormente." ] }, { "cell_type": "code", "execution_count": null, "id": "9be55d46-c4df-4892-8675-add0ee4b2a28", "metadata": {}, "outputs": [], "source": [ "# Código aquí\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b0e63a6e-c3fc-4ca7-9902-f6cc08c1463c", "metadata": { "tags": [] }, "outputs": [], "source": [ "df3 = df2 [df2['comercio'].isin(filtro)]" ] }, { "cell_type": "code", "execution_count": null, "id": "567f4665-e7ef-4847-81b9-1a58a9d46593", "metadata": {}, "outputs": [], "source": [ "df3['comercio'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "49c0bb65-5d40-4303-911f-0efe4b5ecf18", "metadata": {}, "outputs": [], "source": [ "# Guardammos el resultado\n", "df3.to_csv('./data/transacciones3.csv', sep=';', decimal=',', encoding='UTF-8')" ] }, { "cell_type": "markdown", "id": "c905363b-4486-458d-bf94-aad6b7811977", "metadata": {}, "source": [ "## Unas agregaciones" ] }, { "cell_type": "code", "execution_count": null, "id": "4ab2d674-a1ae-4447-9773-524d4e97ec20", "metadata": {}, "outputs": [], "source": [ "# Importe total por tipo de negocio\n", "agregacion1 = pd.pivot_table(data = df3, columns='comercio', values='importetotal', aggfunc=sum).transpose()\n", "agregacion1" ] }, { "cell_type": "code", "execution_count": null, "id": "e18b3bf0-b03e-45e4-b221-cef6d98f2f20", "metadata": {}, "outputs": [], "source": [ "# Agregaciones en base a 2 valores. Extraemos por tipo de comercio, la suma de ventas y la suma de transacciones \n", "agregacion2 = pd.pivot_table(data = df3, columns='comercio', values = ['importetotal', 'transacciones'], aggfunc='sum').transpose()\n", "agregacion2" ] }, { "cell_type": "code", "execution_count": null, "id": "7cc35474-3c2b-4348-a031-80faa38e6214", "metadata": {}, "outputs": [], "source": [ "# Un pelín más complicado\n", "agsum = pd.pivot_table(data=df3, columns=['comercio','mes','cp'], values=['importetotal','transacciones'], aggfunc='sum').transpose()\n", "agsum" ] }, { "cell_type": "code", "execution_count": null, "id": "bac4ab97-f32c-4e4d-b368-304c4a414224", "metadata": {}, "outputs": [], "source": [ "# Usamos otras funciones de agregación\n", "agmean = pd.pivot_table(data=df3, columns=['comercio','mes','cp'], values=['importetotal','transacciones'], aggfunc='mean').transpose()" ] }, { "cell_type": "markdown", "id": "1dc38964-bceb-4c12-9107-6461b92ddf0f", "metadata": {}, "source": [ "# Probad vosotros a hacer una agregación donde se muestre:\n", "## * Total transacciones por código postal\n", "## * Total transacciones por código postal y mes\n", "## * Transacciones máximas por código postal y tipo de comercio" ] }, { "cell_type": "code", "execution_count": null, "id": "dd0c927a-bcaf-4b7a-bf29-07f9292bb4ba", "metadata": {}, "outputs": [], "source": [ "# Agregaciones en base a los percentiles\n", "def p25(g):\n", " return np.percentile(g, 25)\n", "\n", "def p75(g):\n", " return np.percentile(g, 75)\n", "\n", "agpercentiles = df3.pivot_table(\n", " columns=['comercio'], \n", " values=['importetotal'], \n", " aggfunc=[p25, np.median, p75]).transpose()" ] }, { "cell_type": "code", "execution_count": null, "id": "20bc428f-9461-4e66-8779-9cb938db6ea6", "metadata": {}, "outputs": [], "source": [ "agpercentiles" ] }, { "cell_type": "markdown", "id": "4ce9505c-7b26-4600-bfd6-1c131bd091d9", "metadata": {}, "source": [ "## Modelado de tablas" ] }, { "cell_type": "code", "execution_count": null, "id": "34b89b98-aec6-4c4b-a5c1-8858620ec10e", "metadata": {}, "outputs": [], "source": [ "df3.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "2e4cc0cb-1218-4f75-8437-76e7cd904ee3", "metadata": {}, "outputs": [], "source": [ "# Pasar datos a formato ancho\n", "pivotada = df3.pivot(index=['fecha','cp'], columns='comercio', values='importetotal')\n", "pivotada" ] }, { "cell_type": "code", "execution_count": null, "id": "35280a2a-4774-485d-aab9-ba8ccd6cf418", "metadata": {}, "outputs": [], "source": [ "# Eliminamos NA's pq en este caso implican ausencia de venta y por tanto el importe total de la transacción será 0\n", "pivotada [pivotada.isnull()] = 0\n", "pivotada\n", "\n", "# Ojo!!!!! El paso anterior funciona, pq todas las columnas son numéricas. Las de texto están como índice" ] }, { "cell_type": "code", "execution_count": null, "id": "43950261-4a50-49af-bc57-5461424a8e29", "metadata": {}, "outputs": [], "source": [ "# ¿Qué pasa si intentamos pivotar una tabla en la que tenemos varios registros para un mismo dato?\n", "df3.pivot(index=['cp'], columns='comercio', values='importetotal')" ] }, { "cell_type": "code", "execution_count": null, "id": "576d62e5-104d-46c1-a1d0-1a4227f89d4b", "metadata": {}, "outputs": [], "source": [ "# En estos casos deberemos hacer una agregación, para incluir las entradas duplicadas en una sola.\n", "# En este ejemplo, tenemos registros donde para un CP y un Comercio, hay más de un importe total. Varios negocios (o el mismo) puede vender varias veces el mismo día y por tanto.\n", "pivotadacp1 = df3.pivot_table(index=['cp'], columns='comercio', values='importetotal', aggfunc=np.sum)\n", "pivotadacp1" ] }, { "cell_type": "code", "execution_count": null, "id": "3204c842-7a72-4481-8f77-6574b78c55b1", "metadata": {}, "outputs": [], "source": [ "pivotadacp2 = df3.pivot_table(index=['cp'], columns='comercio', values='importetotal', aggfunc='count').reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "69076655-b904-44be-b148-144a771babe8", "metadata": {}, "outputs": [], "source": [ "pivotadacp2" ] }, { "cell_type": "markdown", "id": "cbf9ac62-1c86-42fb-bf6d-c39072ade519", "metadata": {}, "source": [ "## Fundido de tablas" ] }, { "cell_type": "code", "execution_count": null, "id": "6a447a66-9091-4358-9091-c7594e0a3ed6", "metadata": {}, "outputs": [], "source": [ "melt0 = pd.melt(pivotadacp2)\n", "melt0" ] }, { "cell_type": "code", "execution_count": null, "id": "689e70d7-de66-4f6d-ae47-1337da31a82a", "metadata": {}, "outputs": [], "source": [ "# Usaremos este tipo de disposición en los datos para ejecutar test estadísticos (píntame este valor en función de esta columna, mírame si el valor de esta columna influye estadísticamente en el valor de esta otra, ...)" ] }, { "cell_type": "code", "execution_count": null, "id": "c52bb4f8-336a-4a73-81ce-3c45af84c0a5", "metadata": {}, "outputs": [], "source": [ "melt0 = pd.melt(pivotada.reset_index(), id_vars='fecha')" ] }, { "cell_type": "code", "execution_count": null, "id": "c34e3b53-9b18-4c79-a092-e244e40c5d31", "metadata": {}, "outputs": [], "source": [ "melt0" ] }, { "cell_type": "code", "execution_count": null, "id": "6305b077-d3aa-4f94-9ecf-6dd8f479ff7a", "metadata": {}, "outputs": [], "source": [ "# Fundimos a través de varios índices\n", "melt1 = pd.melt(pivotada.reset_index(), id_vars = ['fecha','cp'])\n", "melt1" ] }, { "cell_type": "code", "execution_count": null, "id": "17b32c3e-36e9-46d5-9ef3-e4aa694f449c", "metadata": {}, "outputs": [], "source": [ "melt1.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "4b85ee18-0e3a-4ad0-b59a-67808ae86f36", "metadata": {}, "outputs": [], "source": [ "# Representamos gráficamente los resultados.\n", "plt.rcParams['figure.figsize'] = (20, 20)\n", "sns.regplot(x=melt1['fecha'], y=melt1['value'], fit_reg=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "d0c15147-0f16-4716-844a-03b0e39be82f", "metadata": {}, "outputs": [], "source": [ "plt.rcParams['figure.figsize'] = (20, 20)\n", "sns.lineplot(x=melt1['fecha'], y=melt1['value'], hue=melt1['comercio'])" ] }, { "cell_type": "code", "execution_count": null, "id": "3a0137dc-9da0-47aa-be91-fbf24942ee70", "metadata": {}, "outputs": [], "source": [ "# Pintamos el resultado por código postal\n", "# Extraigo una lista ordenada de los códigos postales.\n", "melt11 = melt1.sort_values('cp')\n", "CP = melt1.cp.reset_index()\n", "CP = CP['cp'].sort_values().unique()\n", "CP" ] }, { "cell_type": "code", "execution_count": null, "id": "98b18585-1a73-4024-94ac-21986a5e2ed9", "metadata": {}, "outputs": [], "source": [ "melt11" ] }, { "cell_type": "code", "execution_count": null, "id": "20c6353d-85d6-4598-9fa1-f508bba359ac", "metadata": {}, "outputs": [], "source": [ "# No se ve muy claro, hay demasiada información.\n", "# Pintamos los datos por código postal\n", "plt.rcParams['figure.figsize'] = (15, 15)\n", "\n", "for a in CP:\n", " datos = melt1[melt1['cp'] == a]\n", " sns.lineplot(x=melt1['fecha'][melt1['cp'] == a], y=melt1['value'], hue=melt1['comercio']).set(title=a)\n", " plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "01c31c2a-98dc-4d39-a534-ab7dacbbed8f", "metadata": {}, "outputs": [], "source": [ "tmp = melt1[(melt1.cp == 48001) | (melt1.cp == 48002)]\n", "plt.rcParams['figure.figsize'] = (25, 25)\n", "g = sns.FacetGrid(tmp, col=\"cp\", height=10)\n", "\n", "sns.lineplot(data=tmp, x='fecha', y='value', hue='comercio')" ] }, { "cell_type": "markdown", "id": "a20db50e-b2e0-49b6-924c-97346d22ddfd", "metadata": {}, "source": [ "## Unión de tablas" ] }, { "cell_type": "code", "execution_count": null, "id": "f7a50657-38c5-4c9b-9cee-56101c129dda", "metadata": {}, "outputs": [], "source": [ "#Creamos los datos\n", "df1 = pd.DataFrame({\n", " 'idCliente':[1,2,3,4,5,6],\n", " 'producto':['Tostadora','Tostadora','Tostadora','Radio','Radio','Radio']})\n", "df2 = pd.DataFrame({\n", " 'idCliente':[2,4,6,8,9],\n", " 'provincia':['Bizkaia', 'Bizkaia','Gipuzkoa','Gipuzkoa','Gipuzkoa']})" ] }, { "cell_type": "code", "execution_count": null, "id": "4ab0c5d1-770d-4195-ab5d-73bd486f171d", "metadata": {}, "outputs": [], "source": [ "print (df1,\"\\n\\n\",df2)" ] }, { "cell_type": "code", "execution_count": null, "id": "6cc08847-2bf7-484d-a4b1-8c84138f7fad", "metadata": {}, "outputs": [], "source": [ "izquierda = pd.merge(\n", " left = df1,\n", " right = df2,\n", " how = 'left',\n", " on = 'idCliente')\n", "izquierda" ] }, { "cell_type": "code", "execution_count": null, "id": "6b811597-4e12-4a7d-9481-54a4c8e655e5", "metadata": {}, "outputs": [], "source": [ "derecha = pd.merge(\n", " left = df1,\n", " right = df2,\n", " how = 'right',\n", " on = 'idCliente')\n", "derecha" ] }, { "cell_type": "code", "execution_count": null, "id": "747393ab-b9d9-40e6-9021-f3ba30366777", "metadata": {}, "outputs": [], "source": [ "inner = pd.merge(\n", " left = df1,\n", " right = df2,\n", " how = 'inner',\n", " on = 'idCliente')\n", "inner" ] }, { "cell_type": "code", "execution_count": null, "id": "3b5117fd-98dc-40fc-9d0c-e4de070c755a", "metadata": {}, "outputs": [], "source": [ "outer = pd.merge(\n", " left = df1,\n", " right = df2,\n", " how = 'outer',\n", " on = 'idCliente')\n", "outer" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 }