{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# scikit-learn provides the dataset loader; pandas (aliased as pd) is used\n",
    "# for the tabular representation and the CSV export.\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "import pandas as pd\n",
    "\n",
    "# Load the California housing dataset.\n",
    "california = fetch_california_housing()\n",
    "\n",
    "# Build a DataFrame from the feature matrix and attach the target values as a\n",
    "# 'Price' column, so the whole table can be written with one to_csv call.\n",
    "data = pd.DataFrame(california.data, columns=california.feature_names).assign(\n",
    "    Price=california.target\n",
    ")\n",
    "\n",
    "# Save to a local CSV file; index=False keeps the row index out of the file.\n",
    "data.to_csv('california_housing.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据集的基本信息：\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 20640 entries, 0 to 20639\n",
      "Data columns (total 9 columns):\n",
      " #   Column      Non-Null Count  Dtype  \n",
      "---  ------      --------------  -----  \n",
      " 0   MedInc      20640 non-null  float64\n",
      " 1   HouseAge    20640 non-null  float64\n",
      " 2   AveRooms    20640 non-null  float64\n",
      " 3   AveBedrms   20640 non-null  float64\n",
      " 4   Population  20640 non-null  float64\n",
      " 5   AveOccup    20640 non-null  float64\n",
      " 6   Latitude    20640 non-null  float64\n",
      " 7   Longitude   20640 non-null  float64\n",
      " 8   Price       20640 non-null  float64\n",
      "dtypes: float64(9)\n",
      "memory usage: 1.4 MB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload the dataset from the CSV file written earlier in the notebook.\n",
    "data = pd.read_csv('california_housing.csv')\n",
    "\n",
    "# Show the column names, non-null counts and dtypes.\n",
    "# DataFrame.info() prints its report itself and returns None, so it is called\n",
    "# directly; wrapping it in print() would add a stray \"None\" line to the output.\n",
    "print(\"数据集的基本信息：\")\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "数据集的统计描述：\n",
      "             MedInc      HouseAge      AveRooms     AveBedrms    Population  \\\n",
      "count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   \n",
      "mean       3.870671     28.639486      5.429000      1.096675   1425.476744   \n",
      "std        1.899822     12.585558      2.474173      0.473911   1132.462122   \n",
      "min        0.499900      1.000000      0.846154      0.333333      3.000000   \n",
      "25%        2.563400     18.000000      4.440716      1.006079    787.000000   \n",
      "50%        3.534800     29.000000      5.229129      1.048780   1166.000000   \n",
      "75%        4.743250     37.000000      6.052381      1.099526   1725.000000   \n",
      "max       15.000100     52.000000    141.909091     34.066667  35682.000000   \n",
      "\n",
      "           AveOccup      Latitude     Longitude         Price  \n",
      "count  20640.000000  20640.000000  20640.000000  20640.000000  \n",
      "mean       3.070655     35.631861   -119.569704      2.068558  \n",
      "std       10.386050      2.135952      2.003532      1.153956  \n",
      "min        0.692308     32.540000   -124.350000      0.149990  \n",
      "25%        2.429741     33.930000   -121.800000      1.196000  \n",
      "50%        2.818116     34.260000   -118.490000      1.797000  \n",
      "75%        3.282261     37.710000   -118.010000      2.647250  \n",
      "max     1243.333333     41.950000   -114.310000      5.000010  \n"
     ]
    }
   ],
   "source": [
    "# Summary statistics of every column (count, mean, std, min, quartiles, max).\n",
    "# The heading and the table are emitted by one print call, separated by a newline.\n",
    "print(\"\\n数据集的统计描述：\", data.describe(), sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集大小: (16512, 8)\n",
      "测试集大小: (4128, 8)\n"
     ]
    }
   ],
   "source": [
    "# Helpers for splitting the data and for standardising the features.\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Separate the feature columns from the target column.\n",
    "X = data.drop('Price', axis=1)  # axis=1 drops the 'Price' column rather than a row\n",
    "y = data['Price']  # target variable\n",
    "\n",
    "# Split BEFORE scaling so that no statistics of the test rows leak into training.\n",
    "# test_size=0.2 gives a train:test ratio of 4:1 (the default is 0.25 when omitted).\n",
    "# random_state fixes the shuffle so the split is reproducible; 42 is simply a\n",
    "# conventional choice.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Fit the scaler on the training features only, then apply that same\n",
    "# transformation to the test features.\n",
    "scaler = StandardScaler()\n",
    "X_train = scaler.fit_transform(X_train)\n",
    "X_test = scaler.transform(X_test)\n",
    "\n",
    "print(\"训练集大小:\", X_train.shape)\n",
    "print(\"测试集大小:\", X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Append a constant column of ones to both feature matrices so that the bias\n",
    "# can be learned as one extra weight:\n",
    "#   X_train = [[1, 2],   ==>   [[1, 2, 1],\n",
    "#              [3, 4],          [3, 4, 1],\n",
    "#              [5, 6]]          [5, 6, 1]]\n",
    "# np.c_ joins its operands column-wise. The ones are built as an explicit\n",
    "# (n_rows, 1) column, where n_rows is the number of samples in the matrix\n",
    "# being extended; this is the same as\n",
    "# np.concatenate((X_train, np.ones((n_rows, 1))), axis=1).\n",
    "# NOTE: the cell rebinds X_train / X_test, so running it a second time\n",
    "# appends another column of ones.\n",
    "X_train = np.c_[X_train, np.ones((X_train.shape[0], 1))]\n",
    "X_test = np.c_[X_test, np.ones((X_test.shape[0], 1))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How the bias b is folded into a single matrix product\n",
    "# (.T denotes the transpose, * denotes matrix multiplication):\n",
    "#\n",
    "#   [[1, 2],  * [2, 2].T + 3 = [9, 17].T\n",
    "#    [3, 4]]\n",
    "#\n",
    "# becomes, after appending a ones column to X and appending b to w:\n",
    "#\n",
    "#   [[1, 2, 1],  * [2, 2, 3].T = [9, 17].T\n",
    "#    [3, 4, 1]]\n",
    "\n",
    "# Random initial parameters, one per column of X_train.\n",
    "# Seeding first makes the draw repeatable.\n",
    "np.random.seed(42)\n",
    "w = np.random.randn(X_train.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear regression as a matrix product: each row of X_test is multiplied\n",
    "# with the current weight vector w.\n",
    "y_pred = X_test @ w.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_loss(X, y, w):\n",
    "    \"\"\"Return half the mean squared error of the predictions X.dot(w.T) against y.\n",
    "\n",
    "    Args:\n",
    "        X: feature matrix, one row per sample.\n",
    "        y: target values; len(y) is used as the sample count.\n",
    "        w: weight vector.\n",
    "\n",
    "    Returns:\n",
    "        (1 / (2 * len(y))) * sum((X.dot(w.T) - y) ** 2)\n",
    "    \"\"\"\n",
    "    n_samples = len(y)\n",
    "    residuals = X.dot(w.T) - y\n",
    "    squared_error_total = np.sum(residuals ** 2)\n",
    "    return (1 / (2 * n_samples)) * squared_error_total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gradient_descent(X, y, w, learning_rate, num_epochs, log_every=100):\n",
    "    \"\"\"Fit linear-regression weights with full-batch gradient descent.\n",
    "\n",
    "    Args:\n",
    "        X: feature matrix, one row per sample.\n",
    "        y: target values; len(y) is used as the sample count.\n",
    "        w: initial weights. The argument is not modified; the function\n",
    "            works on a float copy.\n",
    "        learning_rate: step size applied to the gradient in every epoch.\n",
    "        num_epochs: number of update steps to run.\n",
    "        log_every: print the loss every this many epochs; 0 or None disables\n",
    "            printing. Defaults to 100.\n",
    "\n",
    "    Returns:\n",
    "        (w, losses): the updated weights as a new float array, and the list of\n",
    "        loss values recorded after each update.\n",
    "    \"\"\"\n",
    "    m = len(y)  # number of samples\n",
    "    # Work on a float copy: the in-place `-=` below would otherwise overwrite\n",
    "    # the caller's array, and would raise for an integer-typed array.\n",
    "    w = np.array(w, dtype=float)\n",
    "    losses = []  # loss after each epoch\n",
    "\n",
    "    for epoch in range(num_epochs):\n",
    "        error = X.dot(w.T) - y  # prediction minus target\n",
    "        gradient = (1 / m) * X.T.dot(error)\n",
    "        w -= learning_rate * gradient\n",
    "\n",
    "        # Loss is evaluated with the weights just updated.\n",
    "        loss = compute_loss(X, y, w)\n",
    "        losses.append(loss)\n",
    "\n",
    "        if log_every and (epoch + 1) % log_every == 0:\n",
    "            print(f\"Epoch {epoch + 1}, Loss: {loss}\")\n",
    "\n",
    "    return w, losses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 100, Loss: 5.217977410926638\n",
      "Epoch 200, Loss: 4.157294338348603\n",
      "Epoch 300, Loss: 3.3497577947958708\n",
      "Epoch 400, Loss: 2.73053630433568\n",
      "Epoch 500, Loss: 2.2524287775357092\n",
      "Epoch 600, Loss: 1.8808371972233242\n",
      "Epoch 700, Loss: 1.5902235413805874\n",
      "Epoch 800, Loss: 1.3616019775099957\n",
      "Epoch 900, Loss: 1.1807554474504072\n",
      "Epoch 1000, Loss: 1.0369608872306502\n",
      "Epoch 1100, Loss: 0.9220729667823013\n",
      "Epoch 1200, Loss: 0.8298616116306854\n",
      "Epoch 1300, Loss: 0.7555300018292121\n",
      "Epoch 1400, Loss: 0.6953615663364233\n",
      "Epoch 1500, Loss: 0.6464596797490852\n",
      "Epoch 1600, Loss: 0.6065543688819726\n",
      "Epoch 1700, Loss: 0.5738577577886365\n",
      "Epoch 1800, Loss: 0.5469551927147737\n",
      "Epoch 1900, Loss: 0.5247226639492798\n",
      "Epoch 2000, Loss: 0.5062637436836017\n",
      "Epoch 2100, Loss: 0.49086110954205375\n",
      "Epoch 2200, Loss: 0.477939045937819\n",
      "Epoch 2300, Loss: 0.46703426552508814\n",
      "Epoch 2400, Loss: 0.45777307947831664\n",
      "Epoch 2500, Loss: 0.4498534442859772\n",
      "Epoch 2600, Loss: 0.44303077771234745\n",
      "Epoch 2700, Loss: 0.43710670529934814\n",
      "Epoch 2800, Loss: 0.4319200979764123\n",
      "Epoch 2900, Loss: 0.42733991002630356\n",
      "Epoch 3000, Loss: 0.42325943839974506\n",
      "Epoch 3100, Loss: 0.41959170893575803\n",
      "Epoch 3200, Loss: 0.41626575947165734\n",
      "Epoch 3300, Loss: 0.4132236392332664\n",
      "Epoch 3400, Loss: 0.41041798202152213\n",
      "Epoch 3500, Loss: 0.4078100403067082\n",
      "Epoch 3600, Loss: 0.40536809044261857\n",
      "Epoch 3700, Loss: 0.40306613733806446\n",
      "Epoch 3800, Loss: 0.4008828612116066\n",
      "Epoch 3900, Loss: 0.39880076036811035\n",
      "Epoch 4000, Loss: 0.39680545292749725\n",
      "Epoch 4100, Loss: 0.394885107608247\n",
      "Epoch 4200, Loss: 0.39302997940703527\n",
      "Epoch 4300, Loss: 0.39123203062066747\n"
     ]
    }
   ],
   "source": [
    "# Hyperparameters: gradient step size and number of training epochs.\n",
    "learning_rate = 0.001\n",
    "num_epochs = 5000\n",
    "\n",
    "# Train starting from the current w. The result is stored back in w, so\n",
    "# re-running this cell continues from the already-trained weights.\n",
    "w, losses = gradient_descent(\n",
    "    X_train, y_train, w, learning_rate=learning_rate, num_epochs=num_epochs\n",
    ")\n",
    "\n",
    "# Print the final parameter vector.\n",
    "print(\"训练后的权重和偏置：\", w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Font settings so that the Chinese labels and the minus sign render.\n",
    "plt.rcParams.update({\n",
    "    'font.sans-serif': ['SimHei'],\n",
    "    'axes.unicode_minus': False,\n",
    "})\n",
    "\n",
    "# Loss curve: one recorded loss value per training epoch.\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.plot(losses)\n",
    "plt.title('损失曲线')\n",
    "plt.xlabel('迭代次数')\n",
    "plt.ylabel('损失')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def linear_regression(X, w):\n",
    "    \"\"\"Linear-regression model: return the predictions X.dot(w).\n",
    "\n",
    "    Args:\n",
    "        X: feature matrix, one row per sample.\n",
    "        w: weight vector.\n",
    "\n",
    "    Returns:\n",
    "        The product X.dot(w).\n",
    "    \"\"\"\n",
    "    predictions = X.dot(w)\n",
    "    return predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict on the test set (linear_regression is defined in the previous cell)\n",
    "# and report the loss on the test set.\n",
    "y_pred = linear_regression(X_test, w)\n",
    "print('测试集损失：', compute_loss(X_test, y_test, w))\n",
    "\n",
    "# Font settings so that the Chinese labels and the minus sign render.\n",
    "plt.rcParams['font.sans-serif'] = ['SimHei']\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "\n",
    "# Use one shared range for both axes, covering BOTH the true and the predicted\n",
    "# values, so predictions outside the range of y_test are not cut off.\n",
    "axis_min = min(y_test.min(), y_pred.min())\n",
    "axis_max = max(y_test.max(), y_pred.max())\n",
    "\n",
    "# Scatter of true vs. predicted prices; the dashed red line marks y = x.\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.scatter(y_test, y_pred, alpha=0.5, color='green')\n",
    "plt.plot([axis_min, axis_max], [axis_min, axis_max], 'r--')\n",
    "plt.xlabel('真实价格')\n",
    "plt.ylabel('预测价格')\n",
    "plt.title('真实价格与预测价格对比')\n",
    "plt.xlim(axis_min, axis_max)\n",
    "plt.ylim(axis_min, axis_max)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
