{ "cells": [ { "cell_type": "markdown", "id": "998bd3b9", "metadata": {}, "source": [ "# Parte 2: Transformación de columnas en DataFrame" ] }, { "cell_type": "code", "execution_count": 1, "id": "f406ade1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "pd.options.display.float_format = '{:.2f}'.format #Desactivar notación científica en pandas:\n", "np.set_printoptions(suppress=True) #Desactivar notación científica en numpy:\n", "pd.set_option('display.max_columns', None) #comando para mostrar todas las columnas" ] }, { "cell_type": "markdown", "id": "1209aca3", "metadata": {}, "source": [ "Cargamos datos" ] }, { "cell_type": "code", "execution_count": 2, "id": "b3b877d6", "metadata": {}, "outputs": [], "source": [ "cliente = pd.read_csv('./data/dimension_cliente.csv', sep='\\t', encoding='UTF-8')" ] }, { "cell_type": "code", "execution_count": 3, "id": "92825eb3-c9e1-4365-8920-7b38f946098a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimiento
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981
4524485203-RAlbertoGomisVázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "4 5 24485203-R Alberto Gomis Vázquez H \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", "\n", " Nacimiento \n", "0 07/09/1974 \n", "1 20/09/1937 \n", "2 28/02/1963 \n", "3 23/07/1981 \n", "4 04/04/1956 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "49d49acf-48f8-4082-b233-767952e60304", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(200, 15)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente.shape" ] }, { "cell_type": "code", "execution_count": 5, "id": "37794355-8b27-48ed-9807-0fcdf66e15d8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 200 entries, 0 to 199\n", "Data columns (total 15 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 idCliente 200 non-null int64 \n", " 1 NIF 200 non-null object\n", " 2 Nombre 200 non-null object\n", " 3 Apellido1 200 non-null object\n", " 4 Apellido2 200 non-null object\n", " 5 Sexo 200 non-null object\n", " 6 Direccion 200 non-null object\n", " 7 Municipio 200 non-null object\n", " 8 Provincia 200 non-null object\n", " 9 CP 200 non-null int64 \n", " 10 Comunidad 200 non-null object\n", " 11 Movil 116 non-null object\n", " 12 Telefono 82 non-null object\n", " 13 Correo 198 non-null object\n", " 14 Nacimiento 198 non-null object\n", "dtypes: int64(2), object(13)\n", "memory usage: 23.6+ KB\n" ] } ], "source": [ "cliente.info()" ] }, { "cell_type": "markdown", "id": "bd1bea40-9893-4512-bc11-353b565efad3", "metadata": {}, "source": [ "Existen multitud de herramientas que nos van permitir realizar el análisis exploratorio inicial de manera rápida. Una de ellas es Sweetwiz.\n", "https://github.com/fbdesignpro/sweetviz" ] }, { "cell_type": "code", "execution_count": 6, "id": "34c0b73d-87f0-45e4-99c3-cd18dbc25764", "metadata": {}, "outputs": [], "source": [ "import sweetviz" ] }, { "cell_type": "code", "execution_count": 7, "id": "ec156847-2ddb-4713-9a11-674bda8db16c", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7dbb08b9c20d40bba9f077d2a48b99b8", "version_major": 2, "version_minor": 0 }, "text/plain": [ " | | [ 0%] 00:00 -> (? left)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "informe = sweetviz.analyze(cliente)" ] }, { "cell_type": "code", "execution_count": 8, "id": "ed41b2f1-2810-4796-b5a9-3f5e85908ddb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n" ] } ], "source": [ "informe.show_html()" ] }, { "cell_type": "markdown", "id": "245045aa-187e-4163-bac2-b1ea564035c7", "metadata": {}, "source": [ "¿Qué podéis decirme de la variable CP? ¿Tiene sentido que el informe nos devuelva la media, mediana, ...?" ] }, { "cell_type": "markdown", "id": "164f4355-2512-42a5-b1f7-6b005dc68ddb", "metadata": {}, "source": [ "Otra herramienta Pandas Profiling... https://github.com/pandas-profiling/pandas-profiling" ] }, { "cell_type": "code", "execution_count": 9, "id": "7a8b17bc-48d0-4963-87c3-6ea258cfa8a9", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: dtale in /home/mydoctor/anaconda3/lib/python3.8/site-packages (1.61.1)\n", "Requirement already satisfied: dash-bootstrap-components in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.0.0)\n", "Requirement already satisfied: pandas in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.3.4)\n", "Requirement already satisfied: openpyxl in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (3.0.9)\n", "Requirement already satisfied: xlrd in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.7.1)\n", "Requirement already satisfied: networkx in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (2.6.3)\n", "Requirement already satisfied: six in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.16.0)\n", "Requirement already satisfied: kaleido in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.2.1)\n", "Requirement already satisfied: statsmodels in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.12.2)\n", "Requirement already satisfied: strsimpy in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.2.1)\n", "Requirement already satisfied: flask-ngrok in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.0.25)\n", "Requirement already satisfied: dash-colorscales in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.0.4)\n", "Requirement already satisfied: future>=0.14.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.18.2)\n", "Requirement already satisfied: packaging<=21.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (21.0)\n", "Requirement already satisfied: lz4 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (3.1.3)\n", "Requirement already satisfied: Flask in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.1.2)\n", "Requirement already satisfied: numpy in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.20.3)\n", "Requirement already satisfied: missingno<=0.4.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.4.2)\n", "Requirement already satisfied: scikit-learn==0.24.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.24.2)\n", "Requirement already satisfied: scipy in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.7.1)\n", "Requirement already satisfied: itsdangerous in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (2.0.1)\n", "Requirement already satisfied: matplotlib in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (3.4.3)\n", "Requirement already satisfied: et-xmlfile in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.1.0)\n", "Requirement already satisfied: plotly>=5.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (5.4.0)\n", "Requirement already satisfied: dash-daq in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.5.0)\n", "Requirement already satisfied: dash>=2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (2.0.0)\n", "Requirement already satisfied: cycler in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.10.0)\n", "Requirement already satisfied: ppscore in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.2.0)\n", "Requirement already satisfied: Flask-Compress in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (1.10.1)\n", "Requirement already satisfied: seaborn in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.11.2)\n", "Requirement already satisfied: squarify in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.4.3)\n", "Requirement already satisfied: xarray in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (0.20.1)\n", "Requirement already satisfied: requests in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dtale) (2.26.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from scikit-learn==0.24.2->dtale) (2.2.0)\n", "Requirement already satisfied: joblib>=0.11 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from scikit-learn==0.24.2->dtale) (1.0.1)\n", "Requirement already satisfied: dash-core-components==2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dash>=2.0.0->dtale) (2.0.0)\n", "Requirement already satisfied: dash-html-components==2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dash>=2.0.0->dtale) (2.0.0)\n", "Requirement already satisfied: dash-table==5.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from dash>=2.0.0->dtale) (5.0.0)\n", "Requirement already satisfied: Jinja2>=2.10.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from Flask->dtale) (2.11.3)\n", "Requirement already satisfied: Werkzeug>=0.15 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from Flask->dtale) (0.16.1)\n", "Requirement already satisfied: click>=5.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from Flask->dtale) (8.0.3)\n", "Requirement already satisfied: MarkupSafe>=0.23 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from Jinja2>=2.10.1->Flask->dtale) (2.0.1)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from packaging<=21.0->dtale) (3.0.4)\n", "Requirement already satisfied: tenacity>=6.2.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from plotly>=5.0.0->dtale) (8.0.1)\n", "Requirement already satisfied: brotli in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from Flask-Compress->dtale) (1.0.9)\n", "Requirement already satisfied: python-dateutil>=2.7 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from matplotlib->dtale) (2.8.2)\n", "Requirement already satisfied: pillow>=6.2.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from matplotlib->dtale) (8.4.0)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from matplotlib->dtale) (1.3.1)\n", "Requirement already satisfied: pytz>=2017.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas->dtale) (2021.3)\n", "Requirement already satisfied: idna<4,>=2.5 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests->dtale) (3.2)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests->dtale) (2.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests->dtale) (2021.10.8)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests->dtale) (1.26.7)\n", "Requirement already satisfied: patsy>=0.5 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from statsmodels->dtale) (0.5.2)\n" ] } ], "source": [ "!pip install dtale" ] }, { "cell_type": "code", "execution_count": 10, "id": "f612e600-1ee7-49d6-8459-c03e9930a134", "metadata": {}, "outputs": [], "source": [ "import dtale" ] }, { "cell_type": "code", "execution_count": 11, "id": "fc74c076-af9a-414e-b6de-b2cfbe595eaf", "metadata": {}, "outputs": [], "source": [ "d = dtale.show(cliente)\n", "d.open_browser()" ] }, { "cell_type": "code", "execution_count": 12, "id": "a5385d87-c141-4bb2-b8bc-bac11d703259", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 200 entries, 0 to 199\n", "Data columns (total 15 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 idCliente 200 non-null int64 \n", " 1 NIF 200 non-null object\n", " 2 Nombre 200 non-null object\n", " 3 Apellido1 200 non-null object\n", " 4 Apellido2 200 non-null object\n", " 5 Sexo 200 non-null object\n", " 6 Direccion 200 non-null object\n", " 7 Municipio 200 non-null object\n", " 8 Provincia 200 non-null object\n", " 9 CP 200 non-null int64 \n", " 10 Comunidad 200 non-null object\n", " 11 Movil 116 non-null object\n", " 12 Telefono 82 non-null object\n", " 13 Correo 198 non-null object\n", " 14 Nacimiento 198 non-null object\n", "dtypes: int64(2), object(13)\n", "memory usage: 23.6+ KB\n" ] } ], "source": [ "cliente.info()" ] }, { "cell_type": "code", "execution_count": 13, "id": "7550e28c-ad39-45d5-80a1-5f1d9392df5a", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas-profiling[notebook] in /home/mydoctor/anaconda3/lib/python3.8/site-packages (3.1.0)\n", "Requirement already satisfied: phik>=0.11.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (0.12.0)\n", "Requirement already satisfied: numpy>=1.16.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (1.20.3)\n", "Requirement already satisfied: seaborn>=0.10.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (0.11.2)\n", "Requirement already satisfied: missingno>=0.4.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (0.4.2)\n", "Requirement already satisfied: requests>=2.24.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (2.26.0)\n", "Requirement already satisfied: tqdm>=4.48.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (4.62.3)\n", "Requirement already satisfied: scipy>=1.4.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (1.7.1)\n", "Requirement already satisfied: matplotlib>=3.2.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (3.4.3)\n", "Requirement already satisfied: pydantic>=1.8.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (1.8.2)\n", "Requirement already satisfied: joblib~=1.0.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (1.0.1)\n", "Requirement already satisfied: PyYAML>=5.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (6.0)\n", "Requirement already satisfied: markupsafe~=2.0.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (2.0.1)\n", "Requirement already satisfied: visions[type_image_path]==0.7.4 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (0.7.4)\n", "Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (1.3.4)\n", "Requirement already satisfied: htmlmin>=0.1.12 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (0.1.12)\n", "Requirement already satisfied: jinja2>=2.11.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (2.11.3)\n", "Requirement already satisfied: multimethod>=1.4 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (1.6)\n", "Requirement already satisfied: tangled-up-in-unicode==0.1.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (0.1.0)\n", "Requirement already satisfied: jupyter-core>=4.6.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (4.8.1)\n", "Requirement already satisfied: jupyter-client>=6.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (6.1.12)\n", "Requirement already satisfied: ipywidgets>=7.5.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas-profiling[notebook]) (7.6.5)\n", "Requirement already satisfied: attrs>=19.3.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (21.2.0)\n", "Requirement already satisfied: networkx>=2.4 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (2.6.3)\n", "Requirement already satisfied: imagehash in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (4.2.1)\n", "Requirement already satisfied: Pillow in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (8.4.0)\n", "Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.0.0)\n", "Requirement already satisfied: traitlets>=4.3.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (5.1.0)\n", "Requirement already satisfied: widgetsnbextension~=3.5.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.5.1)\n", "Requirement already satisfied: ipython-genutils~=0.2.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.2.0)\n", "Requirement already satisfied: ipython>=4.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (7.29.0)\n", "Requirement already satisfied: ipykernel>=4.5.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.4.1)\n", "Requirement already satisfied: nbformat>=4.2.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (5.1.3)\n", "Requirement already satisfied: matplotlib-inline<0.2.0,>=0.1.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.1.2)\n", "Requirement already satisfied: debugpy<2.0,>=1.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.4.1)\n", "Requirement already satisfied: tornado<7.0,>=4.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.1)\n", "Requirement already satisfied: setuptools>=18.5 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (58.0.4)\n", "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.0.20)\n", "Requirement already satisfied: pexpect>4.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (4.8.0)\n", "Requirement already satisfied: jedi>=0.16 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.18.0)\n", "Requirement already satisfied: pickleshare in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.7.5)\n", "Requirement already satisfied: decorator in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (5.1.0)\n", "Requirement already satisfied: pygments in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (2.10.0)\n", "Requirement already satisfied: backcall in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.2.0)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.8.2)\n", "Requirement already satisfied: python-dateutil>=2.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from jupyter-client>=6.0.0->pandas-profiling[notebook]) (2.8.2)\n", "Requirement already satisfied: pyzmq>=13 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from jupyter-client>=6.0.0->pandas-profiling[notebook]) (22.2.1)\n", "Requirement already satisfied: cycler>=0.10 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.2.0->pandas-profiling[notebook]) (0.10.0)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.2.0->pandas-profiling[notebook]) (3.0.4)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.2.0->pandas-profiling[notebook]) (1.3.1)\n", "Requirement already satisfied: six in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling[notebook]) (1.16.0)\n", "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.2.0)\n", "Requirement already satisfied: pyrsistent>=0.14.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.18.0)\n", "Requirement already satisfied: pytz>=2017.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3->pandas-profiling[notebook]) (2021.3)\n", "Requirement already satisfied: ptyprocess>=0.5 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pexpect>4.3->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.7.0)\n", "Requirement already satisfied: wcwidth in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.2.5)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from pydantic>=1.8.1->pandas-profiling[notebook]) (3.10.0.2)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (1.26.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (2021.10.8)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (3.2)\n", "Requirement already satisfied: notebook>=4.4.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.4.5)\n", "Requirement already satisfied: argon2-cffi in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (20.1.0)\n", "Requirement already satisfied: nbconvert in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.1.0)\n", "Requirement already satisfied: terminado>=0.8.3 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.9.4)\n", "Requirement already satisfied: Send2Trash>=1.5.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.8.0)\n", "Requirement already satisfied: prometheus-client in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.11.0)\n", "Requirement already satisfied: cffi>=1.0.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.14.6)\n", "Requirement already satisfied: pycparser in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (2.20)\n", "Requirement already satisfied: PyWavelets in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from imagehash->visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (1.1.1)\n", "Requirement already satisfied: mistune<2,>=0.8.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.8.4)\n", "Requirement already satisfied: testpath in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.5.0)\n", "Requirement already satisfied: jupyterlab-pygments in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.1.2)\n", "Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.5.3)\n", "Requirement already satisfied: entrypoints>=0.2.2 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.3)\n", "Requirement already satisfied: bleach in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (4.0.0)\n", "Requirement already satisfied: defusedxml in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.7.1)\n", "Requirement already satisfied: pandocfilters>=1.4.1 in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.4.3)\n", "Requirement already satisfied: nest-asyncio in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.5.1)\n", "Requirement already satisfied: async-generator in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.10)\n", "Requirement already satisfied: packaging in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (21.0)\n", "Requirement already satisfied: webencodings in /home/mydoctor/anaconda3/lib/python3.8/site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.5.1)\n" ] } ], "source": [ "!pip install pandas-profiling[notebook]" ] }, { "cell_type": "markdown", "id": "12c1fad2-d971-483f-a7b7-3329e2469f65", "metadata": {}, "source": [ "Hay muchas mas librerías para la visualización de datos (Autoviz, D-Tale, VAEX, ...)" ] }, { "cell_type": "code", "execution_count": 14, "id": "6490756b-02a8-42f6-a32e-0f2f79e1b120", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NIFNombreApellido1
08899616EAdriánFernández
143238429 QJoséHernández
278747399JuliaRomero
391929848kRaquelPuebla
424485203-RAlbertoGomis
\n", "
" ], "text/plain": [ " NIF Nombre Apellido1\n", "0 8899616E Adrián Fernández\n", "1 43238429 Q José Hernández\n", "2 78747399 Julia Romero\n", "3 91929848k Raquel Puebla\n", "4 24485203-R Alberto Gomis" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Selección de columnas determinadas y todas las filas\n", "seleccion = cliente.iloc [:,1:4]\n", "seleccion.head()" ] }, { "cell_type": "code", "execution_count": 15, "id": "d4da9f3d-0da9-43fe-b171-0251f0c91c9f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NIFNombreApellido2Apellido1
08899616EAdriánReyFernández
143238429 QJoséDomínguezHernández
278747399JuliaTenaRomero
391929848kRaquelÁlvarezPuebla
424485203-RAlbertoVázquezGomis
\n", "
" ], "text/plain": [ " NIF Nombre Apellido2 Apellido1\n", "0 8899616E Adrián Rey Fernández\n", "1 43238429 Q José Domínguez Hernández\n", "2 78747399 Julia Tena Romero\n", "3 91929848k Raquel Álvarez Puebla\n", "4 24485203-R Alberto Vázquez Gomis" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Selección de columnas determinadas y todas las filas\n", "seleccion = cliente.iloc [:,[1,2,4,3]]\n", "seleccion.head()" ] }, { "cell_type": "code", "execution_count": 16, "id": "140ce720-2bde-4981-a600-f67d40252a2b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NIFNombreApellido2
08899616EAdriánRey
143238429 QJoséDomínguez
278747399JuliaTena
391929848kRaquelÁlvarez
424485203-RAlbertoVázquez
\n", "
" ], "text/plain": [ " NIF Nombre Apellido2\n", "0 8899616E Adrián Rey\n", "1 43238429 Q José Domínguez\n", "2 78747399 Julia Tena\n", "3 91929848k Raquel Álvarez\n", "4 24485203-R Alberto Vázquez" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Selección de columnas determinadas y todas las filas\n", "seleccion = cliente [['NIF','Nombre','Apellido2']]\n", "seleccion.head()" ] }, { "cell_type": "code", "execution_count": 17, "id": "756273bd-63cc-42e3-a8b8-b0aabde867df", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NIFNombreApellido2
143238429 QJoséDomínguez
278747399JuliaTena
391929848kRaquelÁlvarez
424485203-RAlbertoVázquez
539375059WAntoniaOsuna
690807766-VJuanMillán
767303411yLauraConde
809153621NAna MaríaMejía
958807708-CInmaculadaSánchez
\n", "
" ], "text/plain": [ " NIF Nombre Apellido2\n", "1 43238429 Q José Domínguez\n", "2 78747399 Julia Tena\n", "3 91929848k Raquel Álvarez\n", "4 24485203-R Alberto Vázquez\n", "5 39375059W Antonia Osuna\n", "6 90807766-V Juan Millán\n", "7 67303411y Laura Conde\n", "8 09153621N Ana María Mejía\n", "9 58807708-C Inmaculada Sánchez" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Selección de columnas y filas determinadas\n", "seleccion = cliente.iloc [1:10,[1,2,4]]\n", "seleccion" ] }, { "cell_type": "code", "execution_count": 18, "id": "0897543d-9572-479c-be4e-33f561351740", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'30/10/1976'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Selección de la última posición del dataframe\n", "seleccion = cliente.iloc [-1,-1]\n", "seleccion" ] }, { "cell_type": "code", "execution_count": 19, "id": "d4274b7b-51c8-44ac-a809-2a153a8d1b49", "metadata": {}, "outputs": [], "source": [ "# Selección de columnas según criterio\n", "\n", "# La selección de columnas la realizamos mediante un array de True/False\n", "filtro = cliente.columns.str.startswith('C')\n", "seleccion = cliente.loc[:, filtro]" ] }, { "cell_type": "code", "execution_count": 20, "id": "9904b59e-8124-4cc1-b62e-5794c32f79aa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, False, False, False, False, False,\n", " True, True, False, False, True, False])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtro" ] }, { "cell_type": "code", "execution_count": 21, "id": "5c3085cd-0c3f-468a-8db4-f1b7f27c80b4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CPComunidadCorreo
01440País Vascoafernandez@correo.es
11271País Vascojosehernandez@mail.net
21606País Vascojuliaromero51@mail.es
31205País Vascoraquel.puebla@correo.com
42552Castilla La Manchaagomis@mail.es
............
19548266País Vascomargarcia@email.com
19648660País Vascorodriguez11@correo.es
19750832Aragóncarmencastelo@correo.net
19850366Aragónmaribenitez62@mail.com
19950624Aragónguerrero50@email.es
\n", "

200 rows × 3 columns

\n", "
" ], "text/plain": [ " CP Comunidad Correo\n", "0 1440 País Vasco afernandez@correo.es\n", "1 1271 País Vasco josehernandez@mail.net\n", "2 1606 País Vasco juliaromero51@mail.es\n", "3 1205 País Vasco raquel.puebla@correo.com\n", "4 2552 Castilla La Mancha agomis@mail.es\n", ".. ... ... ...\n", "195 48266 País Vasco margarcia@email.com\n", "196 48660 País Vasco rodriguez11@correo.es\n", "197 50832 Aragón carmencastelo@correo.net\n", "198 50366 Aragón maribenitez62@mail.com\n", "199 50624 Aragón guerrero50@email.es\n", "\n", "[200 rows x 3 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seleccion" ] }, { "cell_type": "code", "execution_count": 22, "id": "f20ef705-8795-481e-8bd7-d4693b1e9ff9", "metadata": {}, "outputs": [], "source": [ "# Selección de datos en función del sexo.\n", "# Devuelve un array de True/False con los registros\n", "# que cumplen la condición\n", "filtro = cliente['Sexo'] == \"H\"" ] }, { "cell_type": "code", "execution_count": 23, "id": "d9e8c46d-ed12-44f8-a0d7-454a5886afdb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 False\n", "3 False\n", "4 True\n", " ... \n", "195 False\n", "196 False\n", "197 False\n", "198 False\n", "199 True\n", "Name: Sexo, Length: 200, dtype: bool" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtro" ] }, { "cell_type": "code", "execution_count": 24, "id": "ba92597d-eb3f-45a7-996a-1a6e6f0a5ce9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True 118\n", "False 82\n", "Name: Sexo, dtype: int64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtro.value_counts()" ] }, { "cell_type": "code", "execution_count": 25, "id": "d0a5b909-1bfa-4a0c-bcee-24a72314d992", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimiento
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937
4524485203-RAlbertoGomisVázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956
6790807766-VJuanRodríguezMillánHPlaza Preservación 246, 4ºHellínAlbacete2865Castilla La Mancha654999998,0967609980,0rodriguez87@correo.net20/02/1988
101124940055MJesúsMárquezPalaciosHPlaza del Plancton 44, 5º 3ªSanta PolaAlicante3777Valencia656012630,0963296037,0marquez35@correo.net11/09/1973
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "4 5 24485203-R Alberto Gomis Vázquez H \n", "6 7 90807766-V Juan Rodríguez Millán H \n", "10 11 24940055M Jesús Márquez Palacios H \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "4 Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", "6 Plaza Preservación 246, 4º Hellín Albacete 2865 \n", "10 Plaza del Plancton 44, 5º 3ª Santa Pola Alicante 3777 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", "6 Castilla La Mancha 654999998,0 967609980,0 rodriguez87@correo.net \n", "10 Valencia 656012630,0 963296037,0 marquez35@correo.net \n", "\n", " Nacimiento \n", "0 07/09/1974 \n", "1 20/09/1937 \n", "4 04/04/1956 \n", "6 20/02/1988 \n", "10 11/09/1973 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seleccion = cliente.loc [filtro,:]\n", "seleccion.head()" ] }, { "cell_type": "markdown", "id": "0e226709-ae2c-43a5-98b6-0aa06bb0300f", "metadata": {}, "source": [ "Se supone que en selección ahora, no debería hacer registros con sexo M. Lo comprobamos..." ] }, { "cell_type": "code", "execution_count": 26, "id": "aed3d688-6956-450e-820a-b2b4022ed333", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Series([], dtype: int64)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtro = seleccion ['Sexo'] == \"M\"\n", "seleccion.loc [filtro, :].value_counts()" ] }, { "cell_type": "markdown", "id": "641ad6cb-4e95-4d96-875d-07310b77b6b1", "metadata": {}, "source": [ "También podemos filtrar por \"algo\"" ] }, { "cell_type": "code", "execution_count": 27, "id": "d45d4eb3-5e19-445d-aa68-ae2d435320f6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimiento
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981
4524485203-RAlbertoGomisVázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "4 5 24485203-R Alberto Gomis Vázquez H \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", "\n", " Nacimiento \n", "0 07/09/1974 \n", "1 20/09/1937 \n", "2 28/02/1963 \n", "3 23/07/1981 \n", "4 04/04/1956 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtro = cliente ['idCliente'] < 50\n", "# filtro = cliente.idCliente < 50\n", "seleccion = cliente.loc [filtro,:]\n", "seleccion.head()" ] }, { "cell_type": "markdown", "id": "5160bdf9-6108-44d7-a114-296177c975d5", "metadata": {}, "source": [ "Miramos los idCliente únicos" ] }, { "cell_type": "code", "execution_count": 28, "id": "d441365e-0792-45f7-97ca-51e1a7c8536d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,\n", " 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,\n", " 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,\n", " 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,\n", " 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,\n", " 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,\n", " 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,\n", " 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,\n", " 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,\n", " 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,\n", " 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,\n", " 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,\n", " 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,\n", " 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,\n", " 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,\n", " 196, 197, 198, 199, 200])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente['idCliente'].unique()" ] }, { "cell_type": "markdown", "id": "833c1737-f42c-4711-9ad9-8888f950519d", "metadata": {}, "source": [ "¿Se os ocurre una manera rápida de ver si en el array previo faltan clientes?" ] }, { "cell_type": "code", "execution_count": 29, "id": "b60f5d1c-8a62-4f16-a169-251f6a7b1ad6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(seleccion['idCliente'].unique()) >= len(cliente['idCliente'].unique())" ] }, { "cell_type": "markdown", "id": "dcc826c2-9b6e-4917-84ab-c6fb48400a31", "metadata": {}, "source": [ "### Asignación de valores en función de otra/s columna/s" ] }, { "cell_type": "code", "execution_count": 30, "id": "f2df94a1-87d8-4be2-bc74-320f02b02a35", "metadata": {}, "outputs": [], "source": [ "cliente['nuevacolumna'] = \"Mujer\"" ] }, { "cell_type": "code", "execution_count": 31, "id": "27160242-745f-4bf6-977f-5866f0568393", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sexonuevacolumna
0HHombre
1HHombre
2MMujer
3MMujer
4HHombre
.........
195MMujer
196MMujer
197MMujer
198MMujer
199HHombre
\n", "

200 rows × 2 columns

\n", "
" ], "text/plain": [ " Sexo nuevacolumna\n", "0 H Hombre\n", "1 H Hombre\n", "2 M Mujer\n", "3 M Mujer\n", "4 H Hombre\n", ".. ... ...\n", "195 M Mujer\n", "196 M Mujer\n", "197 M Mujer\n", "198 M Mujer\n", "199 H Hombre\n", "\n", "[200 rows x 2 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtro = cliente['Sexo'] == 'H'\n", "cliente.loc [filtro, 'nuevacolumna'] = 'Hombre'\n", "cliente [['Sexo','nuevacolumna']]" ] }, { "cell_type": "markdown", "id": "cf9f07b8-9be3-4957-800f-8ea87f468527", "metadata": {}, "source": [ "Dividimos el dataframe en 2 en función de un criterio. En este caso vamos a dividirlo en base al sexo... (otro método)" ] }, { "cell_type": "code", "execution_count": 32, "id": "911bdf81-6183-4a0f-a8a9-58b51a5ab5b2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H 118\n", "Name: Sexo, dtype: int64\n", "M 82\n", "Name: Sexo, dtype: int64\n" ] } ], "source": [ "hombres, mujeres = cliente [filtro], cliente [-filtro]\n", "print (hombres.Sexo.value_counts())\n", "print (mujeres.Sexo.value_counts())" ] }, { "cell_type": "markdown", "id": "7b47e705-e221-42af-8d80-f0b739bf76ba", "metadata": {}, "source": [ "Lo complicamos un poco más, cogiendo una columna con más de 2 ocurrencias. En este caso, dividimos los datos por Comunidad" ] }, { "cell_type": "code", "execution_count": 33, "id": "0bd352cf-73e0-4b85-b060-12edde61eb8a", "metadata": {}, "outputs": [], "source": [ "datosPorComunidad = {}\n", "\n", "for a in cliente['Comunidad'].unique():\n", "\n", " datosPorComunidad[a] = cliente[cliente['Comunidad'] == a]\n" ] }, { "cell_type": "markdown", "id": "8a29c020-d616-4329-886d-898761279c21", "metadata": {}, "source": [ "Ahora tenemos tantos diccionarios como comunidades tenemos en nuestros datos. Para extraer cualquiera de dichos diccionarios..." ] }, { "cell_type": "code", "execution_count": 34, "id": "86930382-3160-4601-b5e4-cfd01a65ccdf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientonuevacolumna
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974Hombre
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937Hombre
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963Mujer
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981Mujer
19319412886287PMartaRamisMuñozMPaseo de lo Abovedado Audaz 142, 1LejonaVizcaya48655País Vasco642495170,0NaNmarramis62@email.net06/10/1953Mujer
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "193 194 12886287P Marta Ramis Muñoz M \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "193 Paseo de lo Abovedado Audaz 142, 1 Lejona Vizcaya 48655 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "193 País Vasco 642495170,0 NaN marramis62@email.net \n", "\n", " Nacimiento nuevacolumna \n", "0 07/09/1974 Hombre \n", "1 20/09/1937 Hombre \n", "2 28/02/1963 Mujer \n", "3 23/07/1981 Mujer \n", "193 06/10/1953 Mujer " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datosPorComunidad ['País Vasco'].head()" ] }, { "cell_type": "code", "execution_count": 35, "id": "f1daf4dc-a4d0-421c-95ec-44d92da8dd36", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientonuevacolumna
282924352512CAlbertoMendozaMartinHAvenida la Concepción Marrón 174, 4ºBarcelonaBarcelona8270Cataluña652 111 374NaNalb.mendoza@correo.com04/12/1949Hombre
293044166614JAnaLilloPardoMCalle Verboides Gentil 180, 4BarcelonaBarcelona8665CataluñaNaNNaNana.lillo@mail.es07/09/1960Mujer
303116016967CAna IsabelGarcésHernándezMCalle Plancton Aurífero 125, 4BarcelonaBarcelona8466CataluñaNaN934 982 785angarces@correo.com30/05/1988Mujer
313284963229XAna IsabelSalvaRuizMPaseo del Verbo Lento 16, 4ºBarcelonaBarcelona8338Cataluña649 345 787934 355 869ansalva98@email.net19/09/1989Mujer
323301164391QAndreaSolorzanoAbascalMAvenida de Derivación 286, 4BarcelonaBarcelona8945Cataluña603 298 288934 745 303andreasolorzano71@mail.net29/01/1977Mujer
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "28 29 24352512C Alberto Mendoza Martin H \n", "29 30 44166614J Ana Lillo Pardo M \n", "30 31 16016967C Ana Isabel Garcés Hernández M \n", "31 32 84963229X Ana Isabel Salva Ruiz M \n", "32 33 01164391Q Andrea Solorzano Abascal M \n", "\n", " Direccion Municipio Provincia CP \\\n", "28 Avenida la Concepción Marrón 174, 4º Barcelona Barcelona 8270 \n", "29 Calle Verboides Gentil 180, 4 Barcelona Barcelona 8665 \n", "30 Calle Plancton Aurífero 125, 4 Barcelona Barcelona 8466 \n", "31 Paseo del Verbo Lento 16, 4º Barcelona Barcelona 8338 \n", "32 Avenida de Derivación 286, 4 Barcelona Barcelona 8945 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "28 Cataluña 652 111 374 NaN alb.mendoza@correo.com \n", "29 Cataluña NaN NaN ana.lillo@mail.es \n", "30 Cataluña NaN 934 982 785 angarces@correo.com \n", "31 Cataluña 649 345 787 934 355 869 ansalva98@email.net \n", "32 Cataluña 603 298 288 934 745 303 andreasolorzano71@mail.net \n", "\n", " Nacimiento nuevacolumna \n", "28 04/12/1949 Hombre \n", "29 07/09/1960 Mujer \n", "30 30/05/1988 Mujer \n", "31 19/09/1989 Mujer \n", "32 29/01/1977 Mujer " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datosPorComunidad ['Cataluña'].head()" ] }, { "cell_type": "code", "execution_count": 36, "id": "cae261cb-1e46-47b3-ae61-bca3e91caeb7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Los datos segmentados están en formato diccionario\n", "type(datosPorComunidad)" ] }, { "cell_type": "code", "execution_count": 37, "id": "44e0c70a-5f23-442d-bea2-14bf46815a84", "metadata": {}, "outputs": [], "source": [ "# Podemos convertir los datos a dataframe\n", "PaisVasco = pd.DataFrame(datosPorComunidad['País Vasco'])" ] }, { "cell_type": "code", "execution_count": 38, "id": "34ad1946-46d3-4057-a6a8-79291248f9f5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientonuevacolumna
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974Hombre
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937Hombre
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963Mujer
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981Mujer
19319412886287PMartaRamisMuñozMPaseo de lo Abovedado Audaz 142, 1LejonaVizcaya48655País Vasco642495170,0NaNmarramis62@email.net06/10/1953Mujer
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "193 194 12886287P Marta Ramis Muñoz M \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "193 Paseo de lo Abovedado Audaz 142, 1 Lejona Vizcaya 48655 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "193 País Vasco 642495170,0 NaN marramis62@email.net \n", "\n", " Nacimiento nuevacolumna \n", "0 07/09/1974 Hombre \n", "1 20/09/1937 Hombre \n", "2 28/02/1963 Mujer \n", "3 23/07/1981 Mujer \n", "193 06/10/1953 Mujer " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PaisVasco.head()" ] }, { "cell_type": "markdown", "id": "bb695eae-461a-4e79-a265-ec32c2db80ba", "metadata": {}, "source": [ "Otra manera de segmentar los datos, en este caso agruparemos los datos a través de una columna (y obtendremos un dataframe)" ] }, { "cell_type": "code", "execution_count": 39, "id": "306b42b3-65a1-4994-a4e8-513681de6110", "metadata": {}, "outputs": [], "source": [ "datosPorComunidad = cliente.groupby(cliente['Comunidad'])" ] }, { "cell_type": "code", "execution_count": 40, "id": "6d3f9830-b08a-40c2-95bf-9bc09e6f7716", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientonuevacolumna
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974Hombre
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937Hombre
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963Mujer
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981Mujer
19319412886287PMartaRamisMuñozMPaseo de lo Abovedado Audaz 142, 1LejonaVizcaya48655País Vasco642495170,0NaNmarramis62@email.net06/10/1953Mujer
19419548712440GMaría JoséMartínezSerraMCalle de Reconocimiento Deportivo 135, 6ºBilbaoVizcaya48133País Vasco690747457,0943785677,0martinez44@mail.es28/12/1972Mujer
19519688501155FMaría RosarioGarcíaRiberaMAvenida Flagelo 80, 1º 6ªBilbaoVizcaya48266País Vasco608292972,0NaNmargarcia@email.com22/08/1948Mujer
19619744525818APurificaciónRodríguezTorresMPlaza Verbo 258, 2º 4ªBilbaoVizcaya48660País VascoNaNNaNrodriguez11@correo.es01/05/1945Mujer
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "193 194 12886287P Marta Ramis Muñoz M \n", "194 195 48712440G María José Martínez Serra M \n", "195 196 88501155F María Rosario García Ribera M \n", "196 197 44525818A Purificación Rodríguez Torres M \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "193 Paseo de lo Abovedado Audaz 142, 1 Lejona Vizcaya 48655 \n", "194 Calle de Reconocimiento Deportivo 135, 6º Bilbao Vizcaya 48133 \n", "195 Avenida Flagelo 80, 1º 6ª Bilbao Vizcaya 48266 \n", "196 Plaza Verbo 258, 2º 4ª Bilbao Vizcaya 48660 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "193 País Vasco 642495170,0 NaN marramis62@email.net \n", "194 País Vasco 690747457,0 943785677,0 martinez44@mail.es \n", "195 País Vasco 608292972,0 NaN margarcia@email.com \n", "196 País Vasco NaN NaN rodriguez11@correo.es \n", "\n", " Nacimiento nuevacolumna \n", "0 07/09/1974 Hombre \n", "1 20/09/1937 Hombre \n", "2 28/02/1963 Mujer \n", "3 23/07/1981 Mujer \n", "193 06/10/1953 Mujer \n", "194 28/12/1972 Mujer \n", "195 22/08/1948 Mujer \n", "196 01/05/1945 Mujer " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datosPorComunidad.get_group(\"País Vasco\")" ] }, { "cell_type": "code", "execution_count": 41, "id": "bb815a96-2e13-4e37-bb90-3a144350a536", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.groupby.generic.DataFrameGroupBy" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Los datos segmentados están en formato dataframe\n", "type(datosPorComunidad)" ] }, { "cell_type": "markdown", "id": "2fa0f76f-2907-48dd-9b24-30ccb63bb720", "metadata": {}, "source": [ "Ahora que tenemos los datos segmentados, podríamos guardar las tablas de manera independiente." ] }, { "cell_type": "code", "execution_count": 42, "id": "6e859535-416c-47be-a71a-5bd3acd5b6c2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['País Vasco', 'Castilla La Mancha', 'Valencia', 'Andalucía',\n", " 'Castilla y León', 'Extremadura', 'Baleares', 'Cataluña',\n", " 'Galicia', 'Madrid', 'Murcia', 'Asturias', 'Canarias', 'Cantabria',\n", " 'Aragón'], dtype=object)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "comunidades = cliente.Comunidad.unique()\n", "comunidades" ] }, { "cell_type": "code", "execution_count": 43, "id": "f30c10ca-7a3f-495e-af00-98d672a239c4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "En la comunidad País Vasco el total de mujeres es 2 y el de hombres es 6\n", "En la comunidad Castilla La Mancha el total de mujeres es 2 y el de hombres es 3\n", "En la comunidad Valencia el total de mujeres es 9 y el de hombres es 15\n", "En la comunidad Andalucía el total de mujeres es 14 y el de hombres es 35\n", "En la comunidad Castilla y León el total de mujeres es 4 y el de hombres es 4\n", "En la comunidad Baleares el total de mujeres es 3 y el de hombres es 3\n", "En la comunidad Cataluña el total de mujeres es 14 y el de hombres es 22\n", "En la comunidad Galicia el total de mujeres es 6 y el de hombres es 6\n", "En la comunidad Madrid el total de mujeres es 13 y el de hombres es 18\n", "En la comunidad Murcia el total de mujeres es 3 y el de hombres es 3\n", "En la comunidad Canarias el total de mujeres es 3 y el de hombres es 4\n", "En la comunidad Cantabria el total de mujeres es 1 y el de hombres es 1\n", "En la comunidad Aragón el total de mujeres es 1 y el de hombres es 2\n" ] } ], "source": [ "for a in comunidades:\n", " try:\n", " tmp = datosPorComunidad.get_group(a)\n", " mujeres = tmp['Sexo'].value_counts()[1]\n", " hombres = tmp['Sexo'].value_counts()[0]\n", " print (\"En la comunidad \", a, \" el total de mujeres es \", mujeres, \" y el de hombres es \", hombres)\n", " nombre = './data/' + 'datos '+ a + '.csv'\n", " tmp.to_csv(nombre, index=False)\n", " except:\n", " continue" ] }, { "cell_type": "markdown", "id": "107c2e0f-2186-4497-bb33-d5fc35010b09", "metadata": {}, "source": [ "## Transformaciones por columnas" ] }, { "cell_type": "code", "execution_count": 44, "id": "2c9b4799-ff26-49e7-80b7-f838ceb79589", "metadata": {}, "outputs": [], "source": [ "cliente = pd.read_csv('./data/dimension_cliente.csv', sep='\\t', encoding='UTF-8')" ] }, { "cell_type": "code", "execution_count": 45, "id": "bab2cdab-4781-465d-96f1-7e08a2f8355e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientoApellidos
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974Fernández, Rey
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937Hernández, Domínguez
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963Romero, Tena
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981Puebla, Álvarez
4524485203-RAlbertoGomisVázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956Gomis, Vázquez
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "4 5 24485203-R Alberto Gomis Vázquez H \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", "\n", " Nacimiento Apellidos \n", "0 07/09/1974 Fernández, Rey \n", "1 20/09/1937 Hernández, Domínguez \n", "2 28/02/1963 Romero, Tena \n", "3 23/07/1981 Puebla, Álvarez \n", "4 04/04/1956 Gomis, Vázquez " ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Unimos columnas\n", "cliente['Apellidos'] = cliente['Apellido1'] + \", \" + cliente ['Apellido2']\n", "cliente.head()" ] }, { "cell_type": "code", "execution_count": 46, "id": "0552a1af-34a6-4e79-84e0-9301708c7a6f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['idCliente',\n", " 'NIF',\n", " 'Nombre',\n", " 'Apellido1',\n", " 'Apellido2',\n", " 'Apellidos',\n", " 'Sexo',\n", " 'Direccion',\n", " 'Municipio',\n", " 'Provincia',\n", " 'CP',\n", " 'Comunidad',\n", " 'Movil',\n", " 'Telefono',\n", " 'Correo',\n", " 'Nacimiento']" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reordenamos las columnas (colocamos Apellidos junto a Apellido1 y Apellido2)\n", "columnas = list(cliente.columns)\n", "orden = columnas [0:5] + [columnas [-1]] + columnas [5:-1] # Ojo, el segundo elemento de la lista es un str por eso lo convierto en lista antes de añadirlo a orden\n", "orden" ] }, { "cell_type": "code", "execution_count": 47, "id": "56136f1d-055f-4487-b132-bfed192fb881", "metadata": {}, "outputs": [], "source": [ "cliente = cliente.reindex(columns = orden)" ] }, { "cell_type": "code", "execution_count": 48, "id": "941e1df4-b3b3-456a-abc1-1bf7a592802c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2ApellidosSexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimiento
018899616EAdriánFernándezReyFernández, ReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974
1243238429 QJoséHernándezDomínguezHernández, DomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937
2378747399JuliaRomeroTenaRomero, TenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963
3491929848kRaquelPueblaÁlvarezPuebla, ÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981
4524485203-RAlbertoGomisVázquezGomis, VázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Apellidos \\\n", "0 1 8899616E Adrián Fernández Rey Fernández, Rey \n", "1 2 43238429 Q José Hernández Domínguez Hernández, Domínguez \n", "2 3 78747399 Julia Romero Tena Romero, Tena \n", "3 4 91929848k Raquel Puebla Álvarez Puebla, Álvarez \n", "4 5 24485203-R Alberto Gomis Vázquez Gomis, Vázquez \n", "\n", " Sexo Direccion Municipio Provincia CP \\\n", "0 H Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 H Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 M Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 M Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 H Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", "\n", " Nacimiento \n", "0 07/09/1974 \n", "1 20/09/1937 \n", "2 28/02/1963 \n", "3 23/07/1981 \n", "4 04/04/1956 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente.head()" ] }, { "cell_type": "code", "execution_count": 49, "id": "2f593232-c35a-4a45-a0ca-60129c036bff", "metadata": {}, "outputs": [], "source": [ "# Dividimos el campo mail a través de la arroba\n", "cliente['Nick'] = cliente ['Correo'].str.split('@', expand=True)[0]\n", "cliente['Dominio'] = cliente ['Correo'].str.split('@', expand=True)[1]" ] }, { "cell_type": "code", "execution_count": 50, "id": "5954a60e-54e2-4048-9e58-c5e4c1b46e67", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2ApellidosSexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientoNickDominio
018899616EAdriánFernándezReyFernández, ReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974afernandezcorreo.es
1243238429 QJoséHernándezDomínguezHernández, DomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937josehernandezmail.net
2378747399JuliaRomeroTenaRomero, TenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963juliaromero51mail.es
3491929848kRaquelPueblaÁlvarezPuebla, ÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981raquel.pueblacorreo.com
4524485203-RAlbertoGomisVázquezGomis, VázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956agomismail.es
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Apellidos \\\n", "0 1 8899616E Adrián Fernández Rey Fernández, Rey \n", "1 2 43238429 Q José Hernández Domínguez Hernández, Domínguez \n", "2 3 78747399 Julia Romero Tena Romero, Tena \n", "3 4 91929848k Raquel Puebla Álvarez Puebla, Álvarez \n", "4 5 24485203-R Alberto Gomis Vázquez Gomis, Vázquez \n", "\n", " Sexo Direccion Municipio Provincia CP \\\n", "0 H Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 H Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 M Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 M Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 H Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", "\n", " Nacimiento Nick Dominio \n", "0 07/09/1974 afernandez correo.es \n", "1 20/09/1937 josehernandez mail.net \n", "2 28/02/1963 juliaromero51 mail.es \n", "3 23/07/1981 raquel.puebla correo.com \n", "4 04/04/1956 agomis mail.es " ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente.head()" ] }, { "cell_type": "code", "execution_count": 51, "id": "c99e731d-11a9-4bd8-8be3-47947910e5f2", "metadata": {}, "outputs": [], "source": [ "# Vuelvo a reordenar columnas...\n", "columnas = list(cliente.columns)\n", "orden = columnas [0:6] + columnas [-2:] + columnas [6:-2]\n", "cliente = cliente.reindex(columns = orden)" ] }, { "cell_type": "code", "execution_count": 52, "id": "18e07dde-83fa-4123-b283-3bc5f5bd61e7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2ApellidosNickDominioSexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimiento
018899616EAdriánFernándezReyFernández, Reyafernandezcorreo.esHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974
1243238429 QJoséHernándezDomínguezHernández, Domínguezjosehernandezmail.netHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937
2378747399JuliaRomeroTenaRomero, Tenajuliaromero51mail.esMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963
3491929848kRaquelPueblaÁlvarezPuebla, Álvarezraquel.pueblacorreo.comMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981
4524485203-RAlbertoGomisVázquezGomis, Vázquezagomismail.esHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956
\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Apellidos \\\n", "0 1 8899616E Adrián Fernández Rey Fernández, Rey \n", "1 2 43238429 Q José Hernández Domínguez Hernández, Domínguez \n", "2 3 78747399 Julia Romero Tena Romero, Tena \n", "3 4 91929848k Raquel Puebla Álvarez Puebla, Álvarez \n", "4 5 24485203-R Alberto Gomis Vázquez Gomis, Vázquez \n", "\n", " Nick Dominio Sexo Direccion \\\n", "0 afernandez correo.es H Plaza Septentrional Verde 149, 5 \n", "1 josehernandez mail.net H Paseo de Estratega Joven 57, 1º \n", "2 juliaromero51 mail.es M Avenida Ocasión 83, 1º 5ª \n", "3 raquel.puebla correo.com M Calle Trapezoide Prolijo 84, 3º \n", "4 agomis mail.es H Paseo Hemograma 233, 6 \n", "\n", " Municipio Provincia CP Comunidad Movil Telefono \\\n", "0 Vitoria Álava 1440 País Vasco 684 680 588 945 796 001 \n", "1 Vitoria Álava 1271 País Vasco 625265233,0 NaN \n", "2 Vitoria Álava 1606 País Vasco 671737062,0 NaN \n", "3 Vitoria Álava 1205 País Vasco NaN 945562191,0 \n", "4 Almansa Albacete 2552 Castilla La Mancha NaN NaN \n", "\n", " Correo Nacimiento \n", "0 afernandez@correo.es 07/09/1974 \n", "1 josehernandez@mail.net 20/09/1937 \n", "2 juliaromero51@mail.es 28/02/1963 \n", "3 raquel.puebla@correo.com 23/07/1981 \n", "4 agomis@mail.es 04/04/1956 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente.head()" ] }, { "cell_type": "markdown", "id": "248ab31b-f7dd-4e2f-bbe0-a3433189c666", "metadata": { "tags": [] }, "source": [ "## Expresiones regulares" ] }, { "cell_type": "code", "execution_count": 53, "id": "59d1888c-591d-464e-84db-2fd6f4478cb2", "metadata": {}, "outputs": [], "source": [ "cliente = pd.read_csv('./data/dimension_cliente.csv', sep='\\t', encoding='UTF-8')" ] }, { "cell_type": "code", "execution_count": 54, "id": "c59b9008-d3b3-4311-8119-a337d33e4855", "metadata": {}, "outputs": [], "source": [ "# Clientes cuyo NIF tiene un 2\n", "filtro = cliente['NIF'].str.contains('2')\n", "cliente ['TienenUn2'] = filtro" ] }, { "cell_type": "code", "execution_count": 55, "id": "c8ebb144-9b58-436c-b195-88debf256d29", "metadata": {}, "outputs": [], "source": [ "# Clientes cuyo NIF empieza por 2\n", "filtro = cliente['NIF'].str.contains('^2')\n", "cliente ['EmpiezaPor2'] = filtro" ] }, { "cell_type": "code", "execution_count": 56, "id": "15ed0762-75c8-43ed-8351-f5f9ec80ba5b", "metadata": {}, "outputs": [], "source": [ "# Clientes cuyo NIF tiene 9 elementos (longitud 9)\n", "filtro = cliente['NIF'].str.contains('\\w{9}')\n", "cliente ['9Elementos'] = filtro" ] }, { "cell_type": "code", "execution_count": 57, "id": "c8f639cc-ec75-4f62-9ab3-5bca93493d49", "metadata": {}, "outputs": [], "source": [ "# Clientes cuyo NIF tiene el formato correcto\n", "filtro = cliente['NIF'].str.contains('\\d{8}\\w{1}')\n", "cliente ['DniCorrecto'] = filtro" ] }, { "cell_type": "code", "execution_count": 58, "id": "f267cfcb-4629-42ac-82e4-53d565b64e47", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientoTienenUn2EmpiezaPor29ElementosDniCorrecto
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974FalseFalseFalseFalse
1243238429 QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937TrueFalseFalseFalse
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963FalseFalseFalseFalse
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981TrueFalseTrueTrue
4524485203-RAlbertoGomisVázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956TrueTrueFalseFalse
............................................................
19519688501155FMaría RosarioGarcíaRiberaMAvenida Flagelo 80, 1º 6ªBilbaoVizcaya48266País Vasco608292972,0NaNmargarcia@email.com22/08/1948FalseFalseTrueTrue
19619744525818APurificaciónRodríguezTorresMPlaza Verbo 258, 2º 4ªBilbaoVizcaya48660País VascoNaNNaNrodriguez11@correo.es01/05/1945TrueFalseTrueTrue
19719847694054NCarmenCasteloIbáñezMAvenida Tubérculo 163, 6º 1ªZaragozaZaragoza50832Aragón639 839 470NaNcarmencastelo@correo.net12/01/1940FalseFalseTrueTrue
19819940169914TMaría ÁngelesBenítezContrerasMCalle de Dibujo 207, 3º 4ªZaragozaZaragoza50366AragónNaNNaNmaribenitez62@mail.com22/11/1967FalseFalseTrueTrue
19920063551397MMiguelGuerreroLópezHPaseo Disciplina 245, 6ZaragozaZaragoza50624Aragón662-267-322976-569-481guerrero50@email.es30/10/1976FalseFalseTrueTrue
\n", "

200 rows × 19 columns

\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429 Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "4 5 24485203-R Alberto Gomis Vázquez H \n", ".. ... ... ... ... ... ... \n", "195 196 88501155F María Rosario García Ribera M \n", "196 197 44525818A Purificación Rodríguez Torres M \n", "197 198 47694054N Carmen Castelo Ibáñez M \n", "198 199 40169914T María Ángeles Benítez Contreras M \n", "199 200 63551397M Miguel Guerrero López H \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", ".. ... ... ... ... \n", "195 Avenida Flagelo 80, 1º 6ª Bilbao Vizcaya 48266 \n", "196 Plaza Verbo 258, 2º 4ª Bilbao Vizcaya 48660 \n", "197 Avenida Tubérculo 163, 6º 1ª Zaragoza Zaragoza 50832 \n", "198 Calle de Dibujo 207, 3º 4ª Zaragoza Zaragoza 50366 \n", "199 Paseo Disciplina 245, 6 Zaragoza Zaragoza 50624 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", ".. ... ... ... ... \n", "195 País Vasco 608292972,0 NaN margarcia@email.com \n", "196 País Vasco NaN NaN rodriguez11@correo.es \n", "197 Aragón 639 839 470 NaN carmencastelo@correo.net \n", "198 Aragón NaN NaN maribenitez62@mail.com \n", "199 Aragón 662-267-322 976-569-481 guerrero50@email.es \n", "\n", " Nacimiento TienenUn2 EmpiezaPor2 9Elementos DniCorrecto \n", "0 07/09/1974 False False False False \n", "1 20/09/1937 True False False False \n", "2 28/02/1963 False False False False \n", "3 23/07/1981 True False True True \n", "4 04/04/1956 True True False False \n", ".. ... ... ... ... ... \n", "195 22/08/1948 False False True True \n", "196 01/05/1945 True False True True \n", "197 12/01/1940 False False True True \n", "198 22/11/1967 False False True True \n", "199 30/10/1976 False False True True \n", "\n", "[200 rows x 19 columns]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente" ] }, { "cell_type": "code", "execution_count": 59, "id": "3a02d1a8-8787-4aa3-9dc8-a7e3a4c6c791", "metadata": {}, "outputs": [], "source": [ "# Eliminamos espacios y - de la columna NIF y volvemos a comprobar si los NIF correctos \n", "cliente['NIF'] = cliente ['NIF'].str.replace(\"-\",\"\").str.replace(\" \",\"\")" ] }, { "cell_type": "code", "execution_count": 60, "id": "edc74cc0-255f-49b3-89e2-6c5ad99978d9", "metadata": {}, "outputs": [], "source": [ "# Clientes cuyo NIF tiene el formato correcto\n", "filtro = cliente['NIF'].str.contains('\\d{8}\\w{1}')\n", "cliente ['DniCorrecto2'] = filtro" ] }, { "cell_type": "code", "execution_count": 61, "id": "3a2b2372-63d7-4161-b73c-5b35a8fe2a12", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idClienteNIFNombreApellido1Apellido2SexoDireccionMunicipioProvinciaCPComunidadMovilTelefonoCorreoNacimientoTienenUn2EmpiezaPor29ElementosDniCorrectoDniCorrecto2
018899616EAdriánFernándezReyHPlaza Septentrional Verde 149, 5VitoriaÁlava1440País Vasco684 680 588945 796 001afernandez@correo.es07/09/1974FalseFalseFalseFalseFalse
1243238429QJoséHernándezDomínguezHPaseo de Estratega Joven 57, 1ºVitoriaÁlava1271País Vasco625265233,0NaNjosehernandez@mail.net20/09/1937TrueFalseFalseFalseTrue
2378747399JuliaRomeroTenaMAvenida Ocasión 83, 1º 5ªVitoriaÁlava1606País Vasco671737062,0NaNjuliaromero51@mail.es28/02/1963FalseFalseFalseFalseFalse
3491929848kRaquelPueblaÁlvarezMCalle Trapezoide Prolijo 84, 3ºVitoriaÁlava1205País VascoNaN945562191,0raquel.puebla@correo.com23/07/1981TrueFalseTrueTrueTrue
4524485203RAlbertoGomisVázquezHPaseo Hemograma 233, 6AlmansaAlbacete2552Castilla La ManchaNaNNaNagomis@mail.es04/04/1956TrueTrueFalseFalseTrue
...............................................................
19519688501155FMaría RosarioGarcíaRiberaMAvenida Flagelo 80, 1º 6ªBilbaoVizcaya48266País Vasco608292972,0NaNmargarcia@email.com22/08/1948FalseFalseTrueTrueTrue
19619744525818APurificaciónRodríguezTorresMPlaza Verbo 258, 2º 4ªBilbaoVizcaya48660País VascoNaNNaNrodriguez11@correo.es01/05/1945TrueFalseTrueTrueTrue
19719847694054NCarmenCasteloIbáñezMAvenida Tubérculo 163, 6º 1ªZaragozaZaragoza50832Aragón639 839 470NaNcarmencastelo@correo.net12/01/1940FalseFalseTrueTrueTrue
19819940169914TMaría ÁngelesBenítezContrerasMCalle de Dibujo 207, 3º 4ªZaragozaZaragoza50366AragónNaNNaNmaribenitez62@mail.com22/11/1967FalseFalseTrueTrueTrue
19920063551397MMiguelGuerreroLópezHPaseo Disciplina 245, 6ZaragozaZaragoza50624Aragón662-267-322976-569-481guerrero50@email.es30/10/1976FalseFalseTrueTrueTrue
\n", "

200 rows × 20 columns

\n", "
" ], "text/plain": [ " idCliente NIF Nombre Apellido1 Apellido2 Sexo \\\n", "0 1 8899616E Adrián Fernández Rey H \n", "1 2 43238429Q José Hernández Domínguez H \n", "2 3 78747399 Julia Romero Tena M \n", "3 4 91929848k Raquel Puebla Álvarez M \n", "4 5 24485203R Alberto Gomis Vázquez H \n", ".. ... ... ... ... ... ... \n", "195 196 88501155F María Rosario García Ribera M \n", "196 197 44525818A Purificación Rodríguez Torres M \n", "197 198 47694054N Carmen Castelo Ibáñez M \n", "198 199 40169914T María Ángeles Benítez Contreras M \n", "199 200 63551397M Miguel Guerrero López H \n", "\n", " Direccion Municipio Provincia CP \\\n", "0 Plaza Septentrional Verde 149, 5 Vitoria Álava 1440 \n", "1 Paseo de Estratega Joven 57, 1º Vitoria Álava 1271 \n", "2 Avenida Ocasión 83, 1º 5ª Vitoria Álava 1606 \n", "3 Calle Trapezoide Prolijo 84, 3º Vitoria Álava 1205 \n", "4 Paseo Hemograma 233, 6 Almansa Albacete 2552 \n", ".. ... ... ... ... \n", "195 Avenida Flagelo 80, 1º 6ª Bilbao Vizcaya 48266 \n", "196 Plaza Verbo 258, 2º 4ª Bilbao Vizcaya 48660 \n", "197 Avenida Tubérculo 163, 6º 1ª Zaragoza Zaragoza 50832 \n", "198 Calle de Dibujo 207, 3º 4ª Zaragoza Zaragoza 50366 \n", "199 Paseo Disciplina 245, 6 Zaragoza Zaragoza 50624 \n", "\n", " Comunidad Movil Telefono Correo \\\n", "0 País Vasco 684 680 588 945 796 001 afernandez@correo.es \n", "1 País Vasco 625265233,0 NaN josehernandez@mail.net \n", "2 País Vasco 671737062,0 NaN juliaromero51@mail.es \n", "3 País Vasco NaN 945562191,0 raquel.puebla@correo.com \n", "4 Castilla La Mancha NaN NaN agomis@mail.es \n", ".. ... ... ... ... \n", "195 País Vasco 608292972,0 NaN margarcia@email.com \n", "196 País Vasco NaN NaN rodriguez11@correo.es \n", "197 Aragón 639 839 470 NaN carmencastelo@correo.net \n", "198 Aragón NaN NaN maribenitez62@mail.com \n", "199 Aragón 662-267-322 976-569-481 guerrero50@email.es \n", "\n", " Nacimiento TienenUn2 EmpiezaPor2 9Elementos DniCorrecto DniCorrecto2 \n", "0 07/09/1974 False False False False False \n", "1 20/09/1937 True False False False True \n", "2 28/02/1963 False False False False False \n", "3 23/07/1981 True False True True True \n", "4 04/04/1956 True True False False True \n", ".. ... ... ... ... ... ... \n", "195 22/08/1948 False False True True True \n", "196 01/05/1945 True False True True True \n", "197 12/01/1940 False False True True True \n", "198 22/11/1967 False False True True True \n", "199 30/10/1976 False False True True True \n", "\n", "[200 rows x 20 columns]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cliente" ] }, { "cell_type": "markdown", "id": "2858885a-6ae3-4906-aa6a-310fa43e9ebb", "metadata": {}, "source": [ "## El formato fecha" ] }, { "cell_type": "markdown", "id": "13ac2bfd-c112-4dad-82a9-3b76fd2863fd", "metadata": {}, "source": [ "#### Trabajaremos ahora con datos tipo fecha" ] }, { "cell_type": "code", "execution_count": 62, "id": "10b16910-0a59-474d-b453-4ea9dff93416", "metadata": {}, "outputs": [], "source": [ "from datetime import datetime" ] }, { "cell_type": "code", "execution_count": 63, "id": "86504433-0e62-42b8-b9ec-e2280628801c", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({'fecha original': ['1/8/2021 20:30:00',\n", " '15/9/2021 15:15:13',\n", " '3/10/2021 08:10:02', \n", " '3/11/2021 23:59:59', \n", " '3/12/2021 00:00:01'],\n", " 'valor': [6,5, 2, 3, 4]})\n" ] }, { "cell_type": "code", "execution_count": 64, "id": "04871304-964c-4a2b-bbc2-23e6603d00ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 fecha original 5 non-null object\n", " 1 valor 5 non-null int64 \n", "dtypes: int64(1), object(1)\n", "memory usage: 208.0+ bytes\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 65, "id": "42b613e7-a7b4-4515-a65f-1f4fd9ce6922", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 fecha original 5 non-null object \n", " 1 valor 5 non-null int64 \n", " 2 fecha1 5 non-null datetime64[ns]\n", "dtypes: datetime64[ns](1), int64(1), object(1)\n", "memory usage: 248.0+ bytes\n" ] } ], "source": [ "# Convertir string a datetime\n", "df['fecha1'] = pd.to_datetime(df['fecha original'])\n", "df.info()" ] }, { "cell_type": "markdown", "id": "d6e07bf8-97ab-4b1a-817e-ba69afac7d14", "metadata": {}, "source": [ "#### Por defecto Python parsea los datos con el mes primero (MM/DD, MM DD, MM-DD)(formato americano). Para utilizar el formato europeo, añadimos el argumento dayfirst." ] }, { "cell_type": "code", "execution_count": 66, "id": "928d4cad-94b0-43e4-be9a-2bb64b02100d", "metadata": {}, "outputs": [], "source": [ "# Convertir string a datetime 2\n", "df['fecha2'] = pd.to_datetime(df['fecha original'], dayfirst=True) # de esta manera procesamos Año - Mes - Día\n" ] }, { "cell_type": "code", "execution_count": 67, "id": "b3095b44-8504-4521-b6b1-18d1f06fd314", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:00
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:13
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:02
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:59
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:01
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00\n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13\n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02\n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59\n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "60de4222-f505-4892-bbaa-f294b8228c87", "metadata": {}, "source": [ "#### Usando máscaras nos saltamos el problema del formato para fecha que por defecto usa Python, y podemos especificarle " ] }, { "cell_type": "code", "execution_count": 68, "id": "f42b0ead-fe0b-4f6a-832c-7e3a6a7cc91c", "metadata": {}, "outputs": [], "source": [ "# Usando máscaras...\n", "df['fecha3'] = pd.to_datetime(df['fecha original'], format= '%d/%m/%Y %H:%M:%S')" ] }, { "cell_type": "code", "execution_count": 69, "id": "e34d9080-d22f-411a-aea0-b5476b114ace", "metadata": {}, "outputs": [], "source": [ "# Eliminar el timestamp...\n", "df['fecha4'] = pd.to_datetime(df['fecha original']).dt.date" ] }, { "cell_type": "code", "execution_count": 70, "id": "56c0c8dc-a761-498d-b095-8ad5ce557c5b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2fecha3fecha4
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:002021-08-01 20:30:002021-01-08
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-15 15:15:132021-09-15
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:022021-10-03 08:10:022021-03-10
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:592021-11-03 23:59:592021-03-11
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:012021-12-03 00:00:012021-03-12
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2 \\\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00 \n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02 \n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59 \n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 \n", "0 2021-08-01 20:30:00 2021-01-08 \n", "1 2021-09-15 15:15:13 2021-09-15 \n", "2 2021-10-03 08:10:02 2021-03-10 \n", "3 2021-11-03 23:59:59 2021-03-11 \n", "4 2021-12-03 00:00:01 2021-03-12 " ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 71, "id": "826d363d-3a32-4ede-a11f-a92da24dca72", "metadata": {}, "outputs": [], "source": [ "# Extraer la hora de timestamp de los datos\n", "df['hora'] = pd.to_datetime(df['fecha original']).dt.time" ] }, { "cell_type": "code", "execution_count": 72, "id": "24e8d5c7-01d3-42a1-8066-15d49ad08565", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2fecha3fecha4hora
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:00
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:13
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:02
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:59
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:01
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2 \\\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00 \n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02 \n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59 \n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora \n", "0 2021-08-01 20:30:00 2021-01-08 20:30:00 \n", "1 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 2021-10-03 08:10:02 2021-03-10 08:10:02 \n", "3 2021-11-03 23:59:59 2021-03-11 23:59:59 \n", "4 2021-12-03 00:00:01 2021-03-12 00:00:01 " ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "d2a27f72-07c7-4166-86af-cc38e3452e49", "metadata": {}, "source": [ "#### Podemos pedirle a python que infiera el formato de fecha de los datos" ] }, { "cell_type": "code", "execution_count": 73, "id": "80bab888-4d5e-4479-a0ae-a925852c057d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2021-01-08 20:30:00\n", "1 2021-09-15 15:15:13\n", "2 2021-03-10 08:10:02\n", "3 2021-03-11 23:59:59\n", "4 2021-03-12 00:00:01\n", "Name: fecha original, dtype: datetime64[ns]" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_datetime(df['fecha original'], infer_datetime_format=True)" ] }, { "cell_type": "markdown", "id": "eb5731a7-34dc-45fb-8c8b-dc528f9dee13", "metadata": {}, "source": [ "#### to_datetime() incluye un argumento para tratar los casos en los que se produce un error de conversión. Los valores posibles son, raise, coerce, ignore. Por defecto toma raise." ] }, { "cell_type": "code", "execution_count": 74, "id": "ec12f8ea-e749-4da8-aefa-6eb95b786339", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2021-01-08 20:30:00\n", "1 2021-09-15 15:15:13\n", "2 2021-03-10 08:10:02\n", "3 2021-03-11 23:59:59\n", "4 2021-03-12 00:00:01\n", "Name: fecha original, dtype: datetime64[ns]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_datetime(df['fecha original'], errors='ignore')" ] }, { "cell_type": "markdown", "id": "14844160-367a-49e4-a550-832122b07572", "metadata": {}, "source": [ "#### Construcción de una variable fecha a partir de otras columnas" ] }, { "cell_type": "code", "execution_count": 75, "id": "def19696-3276-4326-bc49-7df11da1bb00", "metadata": {}, "outputs": [], "source": [ "# Extraemos primero los datos de día, mes y año para poder reconstruirlos luego\n", "df['dia'] = df['fecha1'].dt.day\n", "df['mes'] = df['fecha1'].dt.month\n", "df['año'] = df['fecha1'].dt.year" ] }, { "cell_type": "code", "execution_count": 76, "id": "34b132bf-7c13-4501-8562-46ae84621463", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2fecha3fecha4horadiamesaño
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:00812021
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:131592021
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:021032021
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:591132021
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:011232021
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2 \\\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00 \n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02 \n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59 \n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora dia mes año \n", "0 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 \n", "1 2021-09-15 15:15:13 2021-09-15 15:15:13 15 9 2021 \n", "2 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 \n", "3 2021-11-03 23:59:59 2021-03-11 23:59:59 11 3 2021 \n", "4 2021-12-03 00:00:01 2021-03-12 00:00:01 12 3 2021 " ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 77, "id": "a877ba2e-55db-434b-b174-62d660112f0b", "metadata": {}, "outputs": [], "source": [ "# Lo hacemos por pasos para ver el proceso\n", "construccionFecha = df.dia.astype('string') + '/' + df.mes.astype('string') + '/' + df['año'].astype('string') \n", "df['fecha5'] = pd.to_datetime(construccionFecha)\n" ] }, { "cell_type": "code", "execution_count": 78, "id": "e8578f33-2d88-4ed9-a339-bd781a47ea30", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2021-08-01\n", "1 2021-09-15\n", "2 2021-10-03\n", "3 2021-11-03\n", "4 2021-12-03\n", "Name: fecha5, dtype: datetime64[ns]" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['fecha5']" ] }, { "cell_type": "markdown", "id": "f8ff96ae-575a-4770-9c7f-2a27fa9cff61", "metadata": {}, "source": [ "#### Otras opciones interesantes:" ] }, { "cell_type": "code", "execution_count": 79, "id": "96f7e46a-23c0-428a-9672-897086e2d19a", "metadata": {}, "outputs": [], "source": [ "# Extracción del número de semana\n", "df['numSemana'] = df['fecha1'].dt.week" ] }, { "cell_type": "code", "execution_count": 80, "id": "d1515e84-c34f-4ac8-aeb2-eb33a0d8b27b", "metadata": {}, "outputs": [], "source": [ "# Extracción del número de día de la semana\n", "df['diadelasemana'] = df['fecha1'].dt.dayofweek" ] }, { "cell_type": "code", "execution_count": 81, "id": "017d4d11-9ad8-4ed9-98f2-1af6204eb438", "metadata": {}, "outputs": [], "source": [ "# Comprobación de año bisiesto\n", "df['bisiesto'] = df['fecha1'].dt.is_leap_year" ] }, { "cell_type": "code", "execution_count": 82, "id": "419e51e7-1099-479b-bef1-2c1e364f9915", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiesto
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114False
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:1315920212021-09-15372False
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:0210320212021-10-03102False
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:5911320212021-11-03103False
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:0112320212021-12-03104False
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2 \\\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00 \n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02 \n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59 \n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora dia mes año fecha5 \\\n", "0 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 2021-08-01 \n", "1 2021-09-15 15:15:13 2021-09-15 15:15:13 15 9 2021 2021-09-15 \n", "2 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 2021-10-03 \n", "3 2021-11-03 23:59:59 2021-03-11 23:59:59 11 3 2021 2021-11-03 \n", "4 2021-12-03 00:00:01 2021-03-12 00:00:01 12 3 2021 2021-12-03 \n", "\n", " numSemana diadelasemana bisiesto \n", "0 1 4 False \n", "1 37 2 False \n", "2 10 2 False \n", "3 10 3 False \n", "4 10 4 False " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 83, "id": "74a9b006-5b85-46f1-a6ff-5bb82d60c8db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiestonombrediadelasemana
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114FalseViernes
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:1315920212021-09-15372FalseMiércoles
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:0210320212021-10-03102FalseMiércoles
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:5911320212021-11-03103FalseJueves
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:0112320212021-12-03104FalseViernes
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2 \\\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00 \n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02 \n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59 \n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora dia mes año fecha5 \\\n", "0 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 2021-08-01 \n", "1 2021-09-15 15:15:13 2021-09-15 15:15:13 15 9 2021 2021-09-15 \n", "2 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 2021-10-03 \n", "3 2021-11-03 23:59:59 2021-03-11 23:59:59 11 3 2021 2021-11-03 \n", "4 2021-12-03 00:00:01 2021-03-12 00:00:01 12 3 2021 2021-12-03 \n", "\n", " numSemana diadelasemana bisiesto nombrediadelasemana \n", "0 1 4 False Viernes \n", "1 37 2 False Miércoles \n", "2 10 2 False Miércoles \n", "3 10 3 False Jueves \n", "4 10 4 False Viernes " ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ahora que tenemos el día de la semana, podemos 'mapear' ese valor a una etiqueta\n", "mapeodiassemana = {\n", " 0: 'Lunes',\n", " 1: 'Martes',\n", " 2: 'Miércoles',\n", " 3: 'Jueves',\n", " 4: 'Viernes',\n", " 5: 'Sábado',\n", " 6: 'Domingo'\n", "}\n", "df['nombrediadelasemana'] = df['diadelasemana'].map(mapeodiassemana)\n", "df" ] }, { "cell_type": "markdown", "id": "13422033-131c-407e-8814-c7b72906838f", "metadata": {}, "source": [ "#### Calcular los días desde una fecha hasta el día actual." ] }, { "cell_type": "code", "execution_count": 84, "id": "045d5bae-6414-49b4-b2a7-747ae890a731", "metadata": {}, "outputs": [], "source": [ "hoy = pd.to_datetime('today')" ] }, { "cell_type": "code", "execution_count": 85, "id": "61974bd5-2411-4fe1-9884-1cd33aca4a75", "metadata": {}, "outputs": [], "source": [ "df['totaldias'] = hoy - df['fecha1']" ] }, { "cell_type": "code", "execution_count": 86, "id": "04bc779b-f486-4c6f-abb5-ee44a34f4bbc", "metadata": {}, "outputs": [], "source": [ "# Si quisieramos ver por ejemplo la diferencia entre años\n", "df['totalaños'] = hoy.year - df['fecha1'].dt.year" ] }, { "cell_type": "code", "execution_count": 87, "id": "be8f63b0-ca32-4de3-924b-6e4dec9712d3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha1fecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiestonombrediadelasemanatotaldiastotalaños
01/8/2021 20:30:0062021-01-08 20:30:002021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114FalseViernes326 days 20:48:00.3987400
115/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:1315920212021-09-15372FalseMiércoles77 days 02:02:47.3987400
23/10/2021 08:10:0222021-03-10 08:10:022021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:0210320212021-10-03102FalseMiércoles266 days 09:07:58.3987400
33/11/2021 23:59:5932021-03-11 23:59:592021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:5911320212021-11-03103FalseJueves264 days 17:18:01.3987400
43/12/2021 00:00:0142021-03-12 00:00:012021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:0112320212021-12-03104FalseViernes264 days 17:17:59.3987400
\n", "
" ], "text/plain": [ " fecha original valor fecha1 fecha2 \\\n", "0 1/8/2021 20:30:00 6 2021-01-08 20:30:00 2021-08-01 20:30:00 \n", "1 15/9/2021 15:15:13 5 2021-09-15 15:15:13 2021-09-15 15:15:13 \n", "2 3/10/2021 08:10:02 2 2021-03-10 08:10:02 2021-10-03 08:10:02 \n", "3 3/11/2021 23:59:59 3 2021-03-11 23:59:59 2021-11-03 23:59:59 \n", "4 3/12/2021 00:00:01 4 2021-03-12 00:00:01 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora dia mes año fecha5 \\\n", "0 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 2021-08-01 \n", "1 2021-09-15 15:15:13 2021-09-15 15:15:13 15 9 2021 2021-09-15 \n", "2 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 2021-10-03 \n", "3 2021-11-03 23:59:59 2021-03-11 23:59:59 11 3 2021 2021-11-03 \n", "4 2021-12-03 00:00:01 2021-03-12 00:00:01 12 3 2021 2021-12-03 \n", "\n", " numSemana diadelasemana bisiesto nombrediadelasemana \\\n", "0 1 4 False Viernes \n", "1 37 2 False Miércoles \n", "2 10 2 False Miércoles \n", "3 10 3 False Jueves \n", "4 10 4 False Viernes \n", "\n", " totaldias totalaños \n", "0 326 days 20:48:00.398740 0 \n", "1 77 days 02:02:47.398740 0 \n", "2 266 days 09:07:58.398740 0 \n", "3 264 days 17:18:01.398740 0 \n", "4 264 days 17:17:59.398740 0 " ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "8ae5af13-1889-4dc9-a4c6-78b36e01f9d5", "metadata": {}, "source": [ "#### Cuando trabajamos con datos con marca temporal, puede ser interesante usar la fecha como índice. De esta manera podremos trabajar con los datos de manera más eficiente." ] }, { "cell_type": "code", "execution_count": 132, "id": "803baf9c-5e15-4b0f-9590-c9c4020335f7", "metadata": {}, "outputs": [], "source": [ "df = df.set_index('fecha1')" ] }, { "cell_type": "code", "execution_count": 133, "id": "41ed53ab-c38a-4cfd-b92b-978fab59cfc5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiestonombrediadelasemanatotaldiastotalaños
fecha1
2021-01-08 20:30:001/8/2021 20:30:0062021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114FalseViernes326 days 20:48:00.3987400
2021-09-15 15:15:1315/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:1315920212021-09-15372FalseMiércoles77 days 02:02:47.3987400
2021-03-10 08:10:023/10/2021 08:10:0222021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:0210320212021-10-03102FalseMiércoles266 days 09:07:58.3987400
2021-03-11 23:59:593/11/2021 23:59:5932021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:5911320212021-11-03103FalseJueves264 days 17:18:01.3987400
2021-03-12 00:00:013/12/2021 00:00:0142021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:0112320212021-12-03104FalseViernes264 days 17:17:59.3987400
\n", "
" ], "text/plain": [ " fecha original valor fecha2 \\\n", "fecha1 \n", "2021-01-08 20:30:00 1/8/2021 20:30:00 6 2021-08-01 20:30:00 \n", "2021-09-15 15:15:13 15/9/2021 15:15:13 5 2021-09-15 15:15:13 \n", "2021-03-10 08:10:02 3/10/2021 08:10:02 2 2021-10-03 08:10:02 \n", "2021-03-11 23:59:59 3/11/2021 23:59:59 3 2021-11-03 23:59:59 \n", "2021-03-12 00:00:01 3/12/2021 00:00:01 4 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora dia mes año \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 \n", "2021-09-15 15:15:13 2021-09-15 15:15:13 2021-09-15 15:15:13 15 9 2021 \n", "2021-03-10 08:10:02 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 \n", "2021-03-11 23:59:59 2021-11-03 23:59:59 2021-03-11 23:59:59 11 3 2021 \n", "2021-03-12 00:00:01 2021-12-03 00:00:01 2021-03-12 00:00:01 12 3 2021 \n", "\n", " fecha5 numSemana diadelasemana bisiesto \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 1 4 False \n", "2021-09-15 15:15:13 2021-09-15 37 2 False \n", "2021-03-10 08:10:02 2021-10-03 10 2 False \n", "2021-03-11 23:59:59 2021-11-03 10 3 False \n", "2021-03-12 00:00:01 2021-12-03 10 4 False \n", "\n", " nombrediadelasemana totaldias totalaños \n", "fecha1 \n", "2021-01-08 20:30:00 Viernes 326 days 20:48:00.398740 0 \n", "2021-09-15 15:15:13 Miércoles 77 days 02:02:47.398740 0 \n", "2021-03-10 08:10:02 Miércoles 266 days 09:07:58.398740 0 \n", "2021-03-11 23:59:59 Jueves 264 days 17:18:01.398740 0 \n", "2021-03-12 00:00:01 Viernes 264 days 17:17:59.398740 0 " ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 134, "id": "0ae3483a-f584-4f8e-a17d-099d7d0c78be", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiestonombrediadelasemanatotaldiastotalaños
fecha1
2021-01-08 20:30:001/8/2021 20:30:0062021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114FalseViernes326 days 20:48:00.3987400
2021-09-15 15:15:1315/9/2021 15:15:1352021-09-15 15:15:132021-09-15 15:15:132021-09-1515:15:1315920212021-09-15372FalseMiércoles77 days 02:02:47.3987400
2021-03-10 08:10:023/10/2021 08:10:0222021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:0210320212021-10-03102FalseMiércoles266 days 09:07:58.3987400
2021-03-11 23:59:593/11/2021 23:59:5932021-11-03 23:59:592021-11-03 23:59:592021-03-1123:59:5911320212021-11-03103FalseJueves264 days 17:18:01.3987400
2021-03-12 00:00:013/12/2021 00:00:0142021-12-03 00:00:012021-12-03 00:00:012021-03-1200:00:0112320212021-12-03104FalseViernes264 days 17:17:59.3987400
\n", "
" ], "text/plain": [ " fecha original valor fecha2 \\\n", "fecha1 \n", "2021-01-08 20:30:00 1/8/2021 20:30:00 6 2021-08-01 20:30:00 \n", "2021-09-15 15:15:13 15/9/2021 15:15:13 5 2021-09-15 15:15:13 \n", "2021-03-10 08:10:02 3/10/2021 08:10:02 2 2021-10-03 08:10:02 \n", "2021-03-11 23:59:59 3/11/2021 23:59:59 3 2021-11-03 23:59:59 \n", "2021-03-12 00:00:01 3/12/2021 00:00:01 4 2021-12-03 00:00:01 \n", "\n", " fecha3 fecha4 hora dia mes año \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 \n", "2021-09-15 15:15:13 2021-09-15 15:15:13 2021-09-15 15:15:13 15 9 2021 \n", "2021-03-10 08:10:02 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 \n", "2021-03-11 23:59:59 2021-11-03 23:59:59 2021-03-11 23:59:59 11 3 2021 \n", "2021-03-12 00:00:01 2021-12-03 00:00:01 2021-03-12 00:00:01 12 3 2021 \n", "\n", " fecha5 numSemana diadelasemana bisiesto \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 1 4 False \n", "2021-09-15 15:15:13 2021-09-15 37 2 False \n", "2021-03-10 08:10:02 2021-10-03 10 2 False \n", "2021-03-11 23:59:59 2021-11-03 10 3 False \n", "2021-03-12 00:00:01 2021-12-03 10 4 False \n", "\n", " nombrediadelasemana totaldias totalaños \n", "fecha1 \n", "2021-01-08 20:30:00 Viernes 326 days 20:48:00.398740 0 \n", "2021-09-15 15:15:13 Miércoles 77 days 02:02:47.398740 0 \n", "2021-03-10 08:10:02 Miércoles 266 days 09:07:58.398740 0 \n", "2021-03-11 23:59:59 Jueves 264 days 17:18:01.398740 0 \n", "2021-03-12 00:00:01 Viernes 264 days 17:17:59.398740 0 " ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Si ahora quisiéramos obtener los datos del 2021\n", "df.loc ['2021']" ] }, { "cell_type": "code", "execution_count": 137, "id": "accb4665-2a44-4875-bc5c-75bc68661392", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiestonombrediadelasemanatotaldiastotalaños
fecha1
2021-01-08 20:30:001/8/2021 20:30:0062021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114FalseViernes326 days 20:48:00.3987400
\n", "
" ], "text/plain": [ " fecha original valor fecha2 \\\n", "fecha1 \n", "2021-01-08 20:30:00 1/8/2021 20:30:00 6 2021-08-01 20:30:00 \n", "\n", " fecha3 fecha4 hora dia mes año \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 \n", "\n", " fecha5 numSemana diadelasemana bisiesto \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 1 4 False \n", "\n", " nombrediadelasemana totaldias totalaños \n", "fecha1 \n", "2021-01-08 20:30:00 Viernes 326 days 20:48:00.398740 0 " ] }, "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc ['2021-01']" ] }, { "cell_type": "code", "execution_count": 147, "id": "e69ef1f5-7292-411a-9add-88bb3ae36f33", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fecha originalvalorfecha2fecha3fecha4horadiamesañofecha5numSemanadiadelasemanabisiestonombrediadelasemanatotaldiastotalaños
fecha1
2021-01-08 20:30:001/8/2021 20:30:0062021-08-01 20:30:002021-08-01 20:30:002021-01-0820:30:008120212021-08-0114FalseViernes326 days 20:48:00.3987400
2021-03-10 08:10:023/10/2021 08:10:0222021-10-03 08:10:022021-10-03 08:10:022021-03-1008:10:0210320212021-10-03102FalseMiércoles266 days 09:07:58.3987400
\n", "
" ], "text/plain": [ " fecha original valor fecha2 \\\n", "fecha1 \n", "2021-01-08 20:30:00 1/8/2021 20:30:00 6 2021-08-01 20:30:00 \n", "2021-03-10 08:10:02 3/10/2021 08:10:02 2 2021-10-03 08:10:02 \n", "\n", " fecha3 fecha4 hora dia mes año \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 20:30:00 2021-01-08 20:30:00 8 1 2021 \n", "2021-03-10 08:10:02 2021-10-03 08:10:02 2021-03-10 08:10:02 10 3 2021 \n", "\n", " fecha5 numSemana diadelasemana bisiesto \\\n", "fecha1 \n", "2021-01-08 20:30:00 2021-08-01 1 4 False \n", "2021-03-10 08:10:02 2021-10-03 10 2 False \n", "\n", " nombrediadelasemana totaldias totalaños \n", "fecha1 \n", "2021-01-08 20:30:00 Viernes 326 days 20:48:00.398740 0 \n", "2021-03-10 08:10:02 Miércoles 266 days 09:07:58.398740 0 " ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Podemos filtrar por rangos de fechas... aquí filtramos todos los registros del mes de enero 2021 entre el 1 y el 10 hasta el 3 de octubre de 2021\n", "df.loc ['2021-1 1 10':'2021-3-10']" ] }, { "cell_type": "markdown", "id": "8d8f048b-df37-45c5-91db-e7221c30ced6", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Discretización de variables" ] }, { "cell_type": "code", "execution_count": 212, "id": "6622021e-d111-4937-a9af-848556fc6293", "metadata": { "tags": [] }, "outputs": [], "source": [ "clima = pd.read_csv('./data/clima.csv', sep=\";\", decimal=\",\", encoding='UTF-8')" ] }, { "cell_type": "code", "execution_count": 153, "id": "82183aac-3c19-4d33-8b37-eccd4ed5209c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fechatemperaturahumedadprecipitacionesviento
001/01/156.1064.000.0014.20
102/01/156.9066.000.0010.40
203/01/158.2071.000.009.80
304/01/1510.6079.000.008.00
405/01/159.0077.000.0011.10
..................
36027/12/1514.7041.000.0019.00
36128/12/1515.8052.004.3022.50
36229/12/1514.7050.000.0019.90
36330/12/1516.0052.000.5027.60
36431/12/1511.5083.002.608.90
\n", "

365 rows × 5 columns

\n", "
" ], "text/plain": [ " fecha temperatura humedad precipitaciones viento\n", "0 01/01/15 6.10 64.00 0.00 14.20\n", "1 02/01/15 6.90 66.00 0.00 10.40\n", "2 03/01/15 8.20 71.00 0.00 9.80\n", "3 04/01/15 10.60 79.00 0.00 8.00\n", "4 05/01/15 9.00 77.00 0.00 11.10\n", ".. ... ... ... ... ...\n", "360 27/12/15 14.70 41.00 0.00 19.00\n", "361 28/12/15 15.80 52.00 4.30 22.50\n", "362 29/12/15 14.70 50.00 0.00 19.90\n", "363 30/12/15 16.00 52.00 0.50 27.60\n", "364 31/12/15 11.50 83.00 2.60 8.90\n", "\n", "[365 rows x 5 columns]" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clima" ] }, { "cell_type": "code", "execution_count": 213, "id": "742ec42f-205d-4c15-8e8d-943d27b08802", "metadata": {}, "outputs": [], "source": [ "# Simplifico los nombres de columna\n", "clima.columns = ['fecha','temperatura','humedad','precipitaciones','viento']" ] }, { "cell_type": "markdown", "id": "4c6b51cd-cd41-4bef-8043-95046f716f3d", "metadata": { "tags": [] }, "source": [ "### Las 3 maneras más habituales de segmentar son por: Intervalo, frecuencia y cluster.\n", "\n", "### **Segmentación por intervalo**\n", "### En la segmentación por intervalo, partimos el rango de datos en n partes iguales. El problema principal se encuentra en los Outliers. Si medimos salarios y resulta que el 99% de la gente tiene salarios sobre 1000€ y el 1% tiene el salario de Amancio Ortega. Los intervalos tendrán la misma amplitud, pero los outliers estarán en el mismo intervalo o valor.\n", "\n", "### **Segmentación por frecuencia.**\n", "### Cada segmento contiene la misma cantidad de datos. El rango de cada segmento es variable\n", "\n", "### **Segmentación por cluster**\n", "### Aquí se parte por clusters, busca en los datos cuantos grupos hay. La herramienta a usar es kmeans (K-Medias)" ] }, { "cell_type": "code", "execution_count": null, "id": "8c4a8783-f8f7-4bcc-8e18-ee551cc6c25a", "metadata": { "jupyter": { "source_hidden": true }, "tags": [] }, "outputs": [], "source": [ "from sklearn.preproccesing import KBin" ] }, { "cell_type": "code", "execution_count": 251, "id": "8ed1d888-0232-4cea-ba51-67d60884d224", "metadata": {}, "outputs": [], "source": [ "# Dicretización por intervalo: Discretizamos en 4 intervalos iguales (en rango de datos)\n", "clima['hum_int'] = pd.cut(clima['humedad'], 4)" ] }, { "cell_type": "code", "execution_count": 207, "id": "fb3d94a2-2de6-4232-874e-383f94c3f5e1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fechatemperaturahumedadprecipitacionesvientohum_intPrec_int2
001/01/156.1064.000.0014.20(50.0, 74.0]NaN
102/01/156.9066.000.0010.40(50.0, 74.0]NaN
203/01/158.2071.000.009.80(50.0, 74.0]NaN
304/01/1510.6079.000.008.00(74.0, 98.0]NaN
405/01/159.0077.000.0011.10(74.0, 98.0]NaN
........................
36027/12/1514.7041.000.0019.00(25.928, 50.0]NaN
36128/12/1515.8052.004.3022.50(50.0, 74.0](0.0, 26.0]
36229/12/1514.7050.000.0019.90(25.928, 50.0]NaN
36330/12/1516.0052.000.5027.60(50.0, 74.0](0.0, 26.0]
36431/12/1511.5083.002.608.90(74.0, 98.0](0.0, 26.0]
\n", "

365 rows × 7 columns

\n", "
" ], "text/plain": [ " fecha temperatura humedad precipitaciones viento hum_int \\\n", "0 01/01/15 6.10 64.00 0.00 14.20 (50.0, 74.0] \n", "1 02/01/15 6.90 66.00 0.00 10.40 (50.0, 74.0] \n", "2 03/01/15 8.20 71.00 0.00 9.80 (50.0, 74.0] \n", "3 04/01/15 10.60 79.00 0.00 8.00 (74.0, 98.0] \n", "4 05/01/15 9.00 77.00 0.00 11.10 (74.0, 98.0] \n", ".. ... ... ... ... ... ... \n", "360 27/12/15 14.70 41.00 0.00 19.00 (25.928, 50.0] \n", "361 28/12/15 15.80 52.00 4.30 22.50 (50.0, 74.0] \n", "362 29/12/15 14.70 50.00 0.00 19.90 (25.928, 50.0] \n", "363 30/12/15 16.00 52.00 0.50 27.60 (50.0, 74.0] \n", "364 31/12/15 11.50 83.00 2.60 8.90 (74.0, 98.0] \n", "\n", " Prec_int2 \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", ".. ... \n", "360 NaN \n", "361 (0.0, 26.0] \n", "362 NaN \n", "363 (0.0, 26.0] \n", "364 (0.0, 26.0] \n", "\n", "[365 rows x 7 columns]" ] }, "execution_count": 207, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clima" ] }, { "cell_type": "code", "execution_count": 215, "id": "a76ec5e0-6080-40c9-b60b-04e3a1c5576d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 358.00\n", "mean 72.00\n", "std 12.01\n", "min 26.00\n", "25% 66.00\n", "50% 74.00\n", "75% 80.00\n", "max 98.00\n", "Name: humedad, dtype: float64" ] }, "execution_count": 215, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Un caso específico sería hacer una discretización en base a los intervalos que queramos. Me voy a basar en los cuartiles para ello.\n", "clima.humedad.describe()" ] }, { "cell_type": "code", "execution_count": 216, "id": "e6e9e301-94cc-467c-8d6a-c17656f4f28d", "metadata": {}, "outputs": [], "source": [ "clima['hum_int2'] = pd.cut(clima['humedad'], bins=[0,26,74,80,98])" ] }, { "cell_type": "code", "execution_count": 221, "id": "0b5213ab-33dc-424d-9949-d3446a398b22", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fechatemperaturahumedadprecipitacionesvientohum_inthum_int2
001/01/156.1064.000.0014.20(62.0, 80.0](26, 74]
102/01/156.9066.000.0010.40(62.0, 80.0](26, 74]
203/01/158.2071.000.009.80(62.0, 80.0](26, 74]
304/01/1510.6079.000.008.00(62.0, 80.0](74, 80]
405/01/159.0077.000.0011.10(62.0, 80.0](74, 80]
........................
36027/12/1514.7041.000.0019.00(25.928, 44.0](26, 74]
36128/12/1515.8052.004.3022.50(44.0, 62.0](26, 74]
36229/12/1514.7050.000.0019.90(44.0, 62.0](26, 74]
36330/12/1516.0052.000.5027.60(44.0, 62.0](26, 74]
36431/12/1511.5083.002.608.90(80.0, 98.0](80, 98]
\n", "

365 rows × 7 columns

\n", "
" ], "text/plain": [ " fecha temperatura humedad precipitaciones viento hum_int \\\n", "0 01/01/15 6.10 64.00 0.00 14.20 (62.0, 80.0] \n", "1 02/01/15 6.90 66.00 0.00 10.40 (62.0, 80.0] \n", "2 03/01/15 8.20 71.00 0.00 9.80 (62.0, 80.0] \n", "3 04/01/15 10.60 79.00 0.00 8.00 (62.0, 80.0] \n", "4 05/01/15 9.00 77.00 0.00 11.10 (62.0, 80.0] \n", ".. ... ... ... ... ... ... \n", "360 27/12/15 14.70 41.00 0.00 19.00 (25.928, 44.0] \n", "361 28/12/15 15.80 52.00 4.30 22.50 (44.0, 62.0] \n", "362 29/12/15 14.70 50.00 0.00 19.90 (44.0, 62.0] \n", "363 30/12/15 16.00 52.00 0.50 27.60 (44.0, 62.0] \n", "364 31/12/15 11.50 83.00 2.60 8.90 (80.0, 98.0] \n", "\n", " hum_int2 \n", "0 (26, 74] \n", "1 (26, 74] \n", "2 (26, 74] \n", "3 (74, 80] \n", "4 (74, 80] \n", ".. ... \n", "360 (26, 74] \n", "361 (26, 74] \n", "362 (26, 74] \n", "363 (26, 74] \n", "364 (80, 98] \n", "\n", "[365 rows x 7 columns]" ] }, "execution_count": 221, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clima" ] }, { "cell_type": "markdown", "id": "5ec99f53-4942-4a7e-b24b-050548ca1055", "metadata": {}, "source": [ "# Probad vosotros a discretizar la variable precipitaciones, usando los mismos cortes que en este ejemplo. ¿Qué sucede?\n", "Mirar la ayuda de pandas\n", "https://pandas.pydata.org/docs/reference/api/pandas.cut.html" ] }, { "cell_type": "code", "execution_count": 223, "id": "fe67ad74-df2c-4e10-91fc-b3474d026ab3", "metadata": {}, "outputs": [], "source": [ "# Incluso añadir etiquetas a los intervalos\n", "clima['hum_int2'] = pd.cut(clima['humedad'], bins=[0,26,74,80,98], include_lowest=True, labels=['1er Int','2do Int', '3er Int', '4o Int'])" ] }, { "cell_type": "code", "execution_count": 232, "id": "176a3a67-a195-491b-9e35-63186940e770", "metadata": {}, "outputs": [], "source": [ "# Dicretización por frecuencia: Discretizamos en 4 intervalos iguales (en rango de datos). Este caso coje los cuartiles que obtenemos en .describe. q puede coger el valor que queramos.\n", "clima['hum_frec'] = pd.qcut(clima['humedad'], q=4)" ] }, { "cell_type": "code", "execution_count": 231, "id": "69afdb9d-a2c4-47d8-a364-0b121933f3e7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fechatemperaturahumedadprecipitacionesvientohum_inthum_int2Prec_frechum_frec
001/01/156.1064.000.0014.20(62.0, 80.0]2do Int(-0.001, 2.2](25.999, 66.0]
102/01/156.9066.000.0010.40(62.0, 80.0]2do Int(-0.001, 2.2](25.999, 66.0]
203/01/158.2071.000.009.80(62.0, 80.0]2do Int(-0.001, 2.2](66.0, 74.0]
304/01/1510.6079.000.008.00(62.0, 80.0]3er Int(-0.001, 2.2](74.0, 80.0]
405/01/159.0077.000.0011.10(62.0, 80.0]3er Int(-0.001, 2.2](74.0, 80.0]
..............................
36027/12/1514.7041.000.0019.00(25.928, 44.0]2do Int(-0.001, 2.2](25.999, 66.0]
36128/12/1515.8052.004.3022.50(44.0, 62.0]2do Int(2.2, 77.2](25.999, 66.0]
36229/12/1514.7050.000.0019.90(44.0, 62.0]2do Int(-0.001, 2.2](25.999, 66.0]
36330/12/1516.0052.000.5027.60(44.0, 62.0]2do Int(-0.001, 2.2](25.999, 66.0]
36431/12/1511.5083.002.608.90(80.0, 98.0]4o Int(2.2, 77.2](80.0, 98.0]
\n", "

365 rows × 9 columns

\n", "
" ], "text/plain": [ " fecha temperatura humedad precipitaciones viento hum_int \\\n", "0 01/01/15 6.10 64.00 0.00 14.20 (62.0, 80.0] \n", "1 02/01/15 6.90 66.00 0.00 10.40 (62.0, 80.0] \n", "2 03/01/15 8.20 71.00 0.00 9.80 (62.0, 80.0] \n", "3 04/01/15 10.60 79.00 0.00 8.00 (62.0, 80.0] \n", "4 05/01/15 9.00 77.00 0.00 11.10 (62.0, 80.0] \n", ".. ... ... ... ... ... ... \n", "360 27/12/15 14.70 41.00 0.00 19.00 (25.928, 44.0] \n", "361 28/12/15 15.80 52.00 4.30 22.50 (44.0, 62.0] \n", "362 29/12/15 14.70 50.00 0.00 19.90 (44.0, 62.0] \n", "363 30/12/15 16.00 52.00 0.50 27.60 (44.0, 62.0] \n", "364 31/12/15 11.50 83.00 2.60 8.90 (80.0, 98.0] \n", "\n", " hum_int2 Prec_frec hum_frec \n", "0 2do Int (-0.001, 2.2] (25.999, 66.0] \n", "1 2do Int (-0.001, 2.2] (25.999, 66.0] \n", "2 2do Int (-0.001, 2.2] (66.0, 74.0] \n", "3 3er Int (-0.001, 2.2] (74.0, 80.0] \n", "4 3er Int (-0.001, 2.2] (74.0, 80.0] \n", ".. ... ... ... \n", "360 2do Int (-0.001, 2.2] (25.999, 66.0] \n", "361 2do Int (2.2, 77.2] (25.999, 66.0] \n", "362 2do Int (-0.001, 2.2] (25.999, 66.0] \n", "363 2do Int (-0.001, 2.2] (25.999, 66.0] \n", "364 4o Int (2.2, 77.2] (80.0, 98.0] \n", "\n", "[365 rows x 9 columns]" ] }, "execution_count": 231, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clima" ] }, { "cell_type": "code", "execution_count": 252, "id": "2acac65c-7a2f-4841-8e89-56206eedca4f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(62.0, 80.0] 204\n", "(80.0, 98.0] 87\n", "(44.0, 62.0] 57\n", "(25.928, 44.0] 10\n", "Name: hum_int, dtype: int64\n", "2do Int 179\n", "3er Int 91\n", "4o Int 87\n", "1er Int 1\n", "Name: hum_int2, dtype: int64\n", "(25.999, 66.0] 95\n", "(74.0, 80.0] 91\n", "(80.0, 98.0] 87\n", "(66.0, 74.0] 85\n", "Name: hum_frec, dtype: int64\n" ] } ], "source": [ "print (pd.value_counts(clima.hum_int))\n", "print (pd.value_counts(clima.hum_int2))\n", "print (pd.value_counts(clima.hum_frec))" ] }, { "cell_type": "code", "execution_count": null, "id": "4eaa2001-9c79-4ab9-ae7c-9341943a0054", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2e3ede4c-6f01-4f28-ba8d-c787c46cead8", "metadata": {}, "source": [ "3 tipos de discretización:\n", "* Intervalo (igual ancho para todos los segmentos)\n", "* Frecuencia (igual frecuencia de aparición de elementos en los segmentos)\n", "* Cluster (discretización en base al algoritmo KMeans)" ] }, { "cell_type": "code", "execution_count": null, "id": "226d3cf8-ef66-45d5-99e3-e7f548a05ae6", "metadata": {}, "outputs": [], "source": [ "!pip install feature_engine" ] }, { "cell_type": "code", "execution_count": null, "id": "ee99f185-90c1-4257-af50-c5ed4e3a631c", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import KBinsDiscretizer\n", "from feature_engine.discretisation import EqualWidthDiscretiser" ] }, { "cell_type": "code", "execution_count": null, "id": "690ffcf5-e195-4bef-a1a1-5dd773f0f3ae", "metadata": {}, "outputs": [], "source": [ "clima = pd.read_csv('./data/clima.csv', sep=';', decimal=',')\n", "clima" ] }, { "cell_type": "code", "execution_count": null, "id": "4fc50ac5-cc8c-4f44-9a71-e07a7d650746", "metadata": {}, "outputs": [], "source": [ "intervalo = EqualWidthDiscretiser(bins=3, variables=['Precipitaciones'])" ] }, { "cell_type": "code", "execution_count": null, "id": "10302556-3cf1-45de-9ca4-88fef5b918b7", "metadata": {}, "outputs": [], "source": [ "intervalo.fit(clima)\n", "transformado = intervalo.transform(clima)" ] }, { "cell_type": "code", "execution_count": null, "id": "22fc0710-c43d-44e1-bfd2-e9a10b6a08c5", "metadata": {}, "outputs": [], "source": [ "transformado" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 }