# Introductory demo: load a CSV with pandas, draw a histogram with
# matplotlib, and compute a mean with numpy.

# 1. IMPORTING PACKAGES
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 2. READING DATA WITH PANDAS FROM GITHUB
# GitHub URL for the diabetes data.
# This must be the *raw* data URL (raw.githubusercontent.com), not the
# GitHub web page URL, so that pandas receives CSV text rather than HTML.
github_url = "https://raw.githubusercontent.com/cambiotraining/ml-unsupervised/main/course_files/data/diabetes_sample_data.csv"

# Read CSV file directly from GitHub
diabetes_data = pd.read_csv(github_url)

# Display basic information about the data
print("\nData shape:", diabetes_data.shape)
print("\nFirst 5 rows:")
print(diabetes_data.head())
print("\nBasic statistics:")
print(diabetes_data.describe())

# 3. PLOTTING WITH MATPLOTLIB
# Plot 1: Histogram of Age
plt.figure()
plt.hist(diabetes_data['age'], bins=20, alpha=0.7)
plt.title('Distribution of Age', fontsize=14, fontweight='bold')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
# Save before show(): the figure is written to the current working directory.
plt.savefig('age_distribution.png', dpi=300)
plt.show()

# 4. NUMPY
a = np.array([17, 13, 78, 901])
print("This is a numpy array:")
print(a)
print("Here is the mean/average:")
print(np.mean(a))
# Load the USArrests data and print its shape, first rows and column names.
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/cambiotraining/ml-unsupervised/refs/heads/main/course_files/data/USArrests.csv"
crime_data = pd.read_csv(url)

# Display the shape of the data as (rows, columns)
print("Shape of the dataset:", crime_data.shape)

# Display the first 3 rows
print("\nFirst 3 rows:")
print(crime_data.head(3))

# Display column names
print("\nColumn names:")
print(crime_data.columns)
Shape of the dataset: (48, 5)
First 3 rows:
      State  Murder  Assault  UrbanPop  Rape
0   Alabama    13.2      236        58  21.2
1    Alaska    10.0      263        48  44.5
2   Arizona     8.1      294        80  31.0
Column names:
Index(['State', 'Murder', 'Assault', 'UrbanPop', 'Rape'], dtype='object')
# Plot a histogram of the "Murder" column of the USArrests data.
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
url = "https://raw.githubusercontent.com/cambiotraining/ml-unsupervised/refs/heads/main/course_files/data/USArrests.csv"
crime_data = pd.read_csv(url)

# Draw histogram
plt.figure()
plt.hist(crime_data["Murder"], bins=15)
plt.grid(True)
plt.xlabel("Murder")
plt.ylabel("Frequency")
plt.title("Distribution of Murder")
plt.show()
# Find the states with the highest "Murder" values in the USArrests data,
# in two different ways.
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
url = "https://raw.githubusercontent.com/cambiotraining/ml-unsupervised/refs/heads/main/course_files/data/USArrests.csv"
crime_data = pd.read_csv(url)

# Use groupby: mean Murder value per State, sorted from highest to lowest;
# head() keeps the top five.
print(
    crime_data.groupby("State")["Murder"]
    .mean()
    .sort_values(ascending=False)
    .head()
)

# Alternative: use idxmax() and loc()
# crime_data["Murder"].idxmax() -> finds the index label of the row holding
#   the maximum value in the Murder column.
# .loc[..., ["State", "Murder"]] -> looks up that row and shows only the
#   State name and the Murder value.
print(crime_data.loc[crime_data["Murder"].idxmax(), ["State", "Murder"]])
State
Georgia           17.4
Mississippi       16.1
Florida           15.4
Louisiana         15.4
South Carolina    14.4
Name: Murder, dtype: float64
State     Georgia
Murder       17.4
Name: 9, dtype: object
5.1.5 Optional exercise on Python
Exercise 5 - exercise_python_numpy
Level:
Fill in the blanks in the code below.
# Exercise template: replace each `...` placeholder with a numpy expression.
import numpy as np

# 1) Reproducibility: fix the seed so the random numbers are the same each run
np.random.seed(7)

# 2) Make a 5x4 array of random numbers in [0, 1)
X = np.random.rand(5, 4)

# 3) Compute:
#    - mean of each column
#    - mean of each row
#    - overall mean
col_means = ...
row_means = ...
overall_mean = ...

print("X:\n", X)
print("Column means:", col_means)
print("Row means:", row_means)
print("Overall mean:", overall_mean)

# 4) Bonus: random integers from 0..99 (size=12). Compare mean to 49.5
# randint's upper bound (100) is exclusive, so values lie in 0..99.
ints = np.random.randint(0, 100, size=12)
ints_mean = ...

print("Random integers:", ints)
print("Integers mean:", ints_mean)
# Exercise solution: column, row and overall means of a random array.
import numpy as np

# 1) Reproducibility: fix the seed so the random numbers are the same each run
np.random.seed(7)

# 2) Make a 5x4 array of random numbers in [0, 1)
X = np.random.rand(5, 4)

# 3) Compute:
#    - mean of each column (axis=0 averages down the rows -> 4 values)
#    - mean of each row    (axis=1 averages across the columns -> 5 values)
#    - overall mean        (no axis -> a single scalar)
col_means = X.mean(axis=0)
row_means = X.mean(axis=1)
overall_mean = X.mean()

print("X:\n", X)
print("Column means:", col_means)
print("Row means:", row_means)
print("Overall mean:", overall_mean)

# 4) Bonus: random integers from 0..99 (size=12). Compare mean to 49.5
# randint's upper bound (100) is exclusive, so values lie in 0..99.
ints = np.random.randint(0, 100, size=12)
ints_mean = ints.mean()

print("Random integers:", ints)
print("Integers mean:", ints_mean)