<!DOCTYPE html><html lang="zh-CN" data-theme="light"><head><meta charset="UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0,viewport-fit=cover"><title>决策树算法 | QuickReference</title><meta name="author" content="shenjianZ"><meta name="copyright" content="shenjianZ"><meta name="format-detection" content="telephone=no"><meta name="theme-color" content="#ffffff"><meta name="description" content="C4.5C4.5 是一种用于生成决策树的算法,不再使用信息增益,而是使用信息增益比,来避免偏向于选择取值较多的特征。信息增益比是信息增益与特征的熵的比值。 ID3D3 是一种基于信息增益Information Gain的决策树算法 CartCART分类与回归树一种决策树算法CART 使用 二叉树结构,即每个节点只能有两个子节点。 cart剪枝CART 决策树的剪枝方法分为 预剪枝Pr">
<meta property="og:type" content="article">
<meta property="og:title" content="决策树算法">
<meta property="og:url" content="https://rq.shenjianl.cn/posts/95.html">
<meta property="og:site_name" content="QuickReference">
<meta property="og:description" content="C4.5C4.5 是一种用于生成决策树的算法,不再使用信息增益,而是使用信息增益比,来避免偏向于选择取值较多的特征。信息增益比是信息增益与特征的熵的比值。 ID3D3 是一种基于信息增益Information Gain的决策树算法 CartCART分类与回归树一种决策树算法CART 使用 二叉树结构,即每个节点只能有两个子节点。 cart剪枝CART 决策树的剪枝方法分为 预剪枝Pr">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://rq.shenjianl.cn/img/machinelearning/decision-tree.png">
<meta property="article:published_time" content="2025-01-24T04:39:59.000Z">
<meta property="article:modified_time" content="2025-02-09T02:26:52.799Z">
<meta property="article:author" content="shenjianZ">
<meta property="article:tag" content="decisiontree">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://rq.shenjianl.cn/img/machinelearning/decision-tree.png"><link rel="shortcut icon" href="/img/favicon.png"><link rel="canonical" href="https://rq.shenjianl.cn/posts/95.html"><link rel="preconnect" href="//cdn.jsdelivr.net"/><link rel="preconnect" href="//busuanzi.ibruce.info"/><link rel="stylesheet" href="/css/index.css?v=4.13.0"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@6.5.1/css/all.min.css"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fancyapps/ui@5.0.33/dist/fancybox/fancybox.min.css" media="print" onload="this.media='all'"><script>const GLOBAL_CONFIG = {
root: '/',
algolia: undefined,
localSearch: {"path":"/search.xml","preload":false,"top_n_per_article":1,"unescape":true,"languages":{"hits_empty":"找不到您查询的内容:${query}","hits_stats":"共找到 ${hits} 篇文章"}},
translate: undefined,
noticeOutdate: undefined,
highlight: {"plugin":"highlight.js","highlightCopy":true,"highlightLang":true,"highlightHeightLimit":false},
copy: {
success: '复制成功',
error: '复制错误',
noSupport: '浏览器不支持'
},
relativeDate: {
homepage: false,
post: false
},
runtime: '天',
dateSuffix: {
just: '刚刚',
min: '分钟前',
hour: '小时前',
day: '天前',
month: '个月前'
},
copyright: undefined,
lightbox: 'fancybox',
Snackbar: undefined,
infinitegrid: {
js: 'https://cdn.jsdelivr.net/npm/@egjs/infinitegrid@4.11.1/dist/infinitegrid.min.js',
buttonText: '加载更多'
},
isPhotoFigcaption: false,
islazyload: false,
isAnchor: false,
percent: {
toc: true,
rightside: false,
},
autoDarkmode: false
}</script><script id="config-diff">var GLOBAL_CONFIG_SITE = {
title: '决策树算法',
isPost: true,
isHome: false,
isHighlightShrink: undefined,
isToc: true,
postUpdate: '2025-02-09 10:26:52'
}</script><script>(win=>{
win.saveToLocal = {
set: (key, value, ttl) => {
if (ttl === 0) return
const now = Date.now()
const expiry = now + ttl * 86400000
const item = {
value,
expiry
}
localStorage.setItem(key, JSON.stringify(item))
},
get: key => {
const itemStr = localStorage.getItem(key)
if (!itemStr) {
return undefined
}
const item = JSON.parse(itemStr)
const now = Date.now()
if (now > item.expiry) {
localStorage.removeItem(key)
return undefined
}
return item.value
}
}
win.getScript = (url, attr = {}) => new Promise((resolve, reject) => {
const script = document.createElement('script')
script.src = url
script.async = true
script.onerror = reject
script.onload = script.onreadystatechange = function() {
const loadState = this.readyState
if (loadState && loadState !== 'loaded' && loadState !== 'complete') return
script.onload = script.onreadystatechange = null
resolve()
}
Object.keys(attr).forEach(key => {
script.setAttribute(key, attr[key])
})
document.head.appendChild(script)
})
win.getCSS = (url, id = false) => new Promise((resolve, reject) => {
const link = document.createElement('link')
link.rel = 'stylesheet'
link.href = url
if (id) link.id = id
link.onerror = reject
link.onload = link.onreadystatechange = function() {
const loadState = this.readyState
if (loadState && loadState !== 'loaded' && loadState !== 'complete') return
link.onload = link.onreadystatechange = null
resolve()
}
document.head.appendChild(link)
})
win.activateDarkMode = () => {
document.documentElement.setAttribute('data-theme', 'dark')
if (document.querySelector('meta[name="theme-color"]') !== null) {
document.querySelector('meta[name="theme-color"]').setAttribute('content', '#0d0d0d')
}
}
win.activateLightMode = () => {
document.documentElement.setAttribute('data-theme', 'light')
if (document.querySelector('meta[name="theme-color"]') !== null) {
document.querySelector('meta[name="theme-color"]').setAttribute('content', '#ffffff')
}
}
const t = saveToLocal.get('theme')
if (t === 'dark') activateDarkMode()
else if (t === 'light') activateLightMode()
const asideStatus = saveToLocal.get('aside-status')
if (asideStatus !== undefined) {
if (asideStatus === 'hide') {
document.documentElement.classList.add('hide-aside')
} else {
document.documentElement.classList.remove('hide-aside')
}
}
const detectApple = () => {
if(/iPad|iPhone|iPod|Macintosh/.test(navigator.userAgent)){
document.documentElement.classList.add('apple')
}
}
detectApple()
})(window)</script><meta name="generator" content="Hexo 7.3.0"></head><body><div id="web_bg"></div><div id="sidebar"><div id="menu-mask"></div><div id="sidebar-menus"><div class="avatar-img is-center"><img src="/img/avatar.jpg" onerror="onerror=null;src='/img/friend_404.gif'" alt="avatar"/></div><div class="sidebar-site-data site-data is-center"><a href="/archives/"><div class="headline">文章</div><div class="length-num">18</div></a><a href="/tags/"><div class="headline">标签</div><div class="length-num">9</div></a><a href="/categories/"><div class="headline">分类</div><div class="length-num">2</div></a></div><hr class="custom-hr"/><div class="menus_items"><div class="menus_item"><a class="site-page" href="/"><i class="fa-fw fas fa-home"></i><span> Home</span></a></div><div class="menus_item"><a class="site-page" href="/archives/"><i class="fa-fw fas fa-archive"></i><span> Archives</span></a></div><div class="menus_item"><a class="site-page" href="/tags/"><i class="fa-fw fas fa-tags"></i><span> Tags</span></a></div><div class="menus_item"><a class="site-page" href="/categories/"><i class="fa-fw fas fa-folder-open"></i><span> Categories</span></a></div><div class="menus_item"><a class="site-page group" href="javascript:void(0);"><i class="fa-fw fas fa-list"></i><span> List</span><i class="fas fa-chevron-down"></i></a><ul class="menus_item_child"><li><a class="site-page child" href="/music/"><i class="fa-fw fas fa-music"></i><span> Music</span></a></li><li><a class="site-page child" href="/movies/"><i class="fa-fw fas fa-video"></i><span> Movie</span></a></li></ul></div><div class="menus_item"><a class="site-page" href="/link/"><i class="fa-fw fas fa-link"></i><span> Link</span></a></div><div class="menus_item"><a class="site-page" href="/about/"><i class="fa-fw fas fa-heart"></i><span> About</span></a></div></div></div></div><div class="post" id="body-wrap"><header class="post-bg fixed" id="page-header" style="background-image: url('/img/machinelearning/decision-tree.png')"><nav id="nav"><span id="blog-info"><a href="/" title="QuickReference"><span class="site-name">QuickReference</span></a></span><div id="menus"><div id="search-button"><a class="site-page social-icon search" href="javascript:void(0);"><i class="fas fa-search fa-fw"></i><span> 搜索</span></a></div><div class="menus_items"><div class="menus_item"><a class="site-page" href="/"><i class="fa-fw fas fa-home"></i><span> Home</span></a></div><div class="menus_item"><a class="site-page" href="/archives/"><i class="fa-fw fas fa-archive"></i><span> Archives</span></a></div><div class="menus_item"><a class="site-page" href="/tags/"><i class="fa-fw fas fa-tags"></i><span> Tags</span></a></div><div class="menus_item"><a class="site-page" href="/categories/"><i class="fa-fw fas fa-folder-open"></i><span> Categories</span></a></div><div class="menus_item"><a class="site-page group" href="javascript:void(0);"><i class="fa-fw fas fa-list"></i><span> List</span><i class="fas fa-chevron-down"></i></a><ul class="menus_item_child"><li><a class="site-page child" href="/music/"><i class="fa-fw fas fa-music"></i><span> Music</span></a></li><li><a class="site-page child" href="/movies/"><i class="fa-fw fas fa-video"></i><span> Movie</span></a></li></ul></div><div class="menus_item"><a class="site-page" href="/link/"><i class="fa-fw fas fa-link"></i><span> Link</span></a></div><div class="menus_item"><a class="site-page" href="/about/"><i class="fa-fw fas fa-heart"></i><span> About</span></a></div></div><div id="toggle-menu"><a class="site-page" 
href="javascript:void(0);"><i class="fas fa-bars fa-fw"></i></a></div></div></nav><div id="post-info"><h1 class="post-title">决策树算法</h1><div id="post-meta"><div class="meta-firstline"><span class="post-meta-date"><i class="far fa-calendar-alt fa-fw post-meta-icon"></i><span class="post-meta-label">发表于</span><time class="post-meta-date-created" datetime="2025-01-24T04:39:59.000Z" title="发表于 2025-01-24 12:39:59">2025-01-24</time><span class="post-meta-separator">|</span><i class="fas fa-history fa-fw post-meta-icon"></i><span class="post-meta-label">更新于</span><time class="post-meta-date-updated" datetime="2025-02-09T02:26:52.799Z" title="更新于 2025-02-09 10:26:52">2025-02-09</time></span><span class="post-meta-categories"><span class="post-meta-separator">|</span><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/machinelearning/">machinelearning</a></span></div><div class="meta-secondline"><span class="post-meta-separator">|</span><span class="post-meta-pv-cv" id="" data-flag-title="决策树算法"><i class="far fa-eye fa-fw post-meta-icon"></i><span class="post-meta-label">阅读量:</span><span id="busuanzi_value_page_pv"><i class="fa-solid fa-spinner fa-spin"></i></span></span></div></div></div></header><main class="layout" id="content-inner"><div id="post"><article class="post-content" id="article-container"><h3 id="C4-5"><a href="#C4-5" class="headerlink" title="C4.5"></a>C4.5</h3><p>C4.5 是一种用于生成决策树的算法,不再使用信息增益,而是使用信息增益比,来避免偏向于选择取值较多的特征。信息增益比是信息增益与特征的熵的比值。</p>
<h3 id="ID3"><a href="#ID3" class="headerlink" title="ID3"></a>ID3</h3><p>D3 是一种基于信息增益Information Gain的决策树算法</p>
<h3 id="Cart"><a href="#Cart" class="headerlink" title="Cart"></a>Cart</h3><p>CART分类与回归树一种决策树算法CART 使用 <strong>二叉树结构</strong>,即每个节点只能有两个子节点。</p>
<h3 id="cart剪枝"><a href="#cart剪枝" class="headerlink" title="cart剪枝"></a>cart剪枝</h3><p>CART 决策树的剪枝方法分为 <strong>预剪枝</strong>Pre-pruning<strong>后剪枝</strong>Post-pruning两种</p>
<h4 id="预剪枝:"><a href="#预剪枝:" class="headerlink" title="预剪枝:"></a><strong>预剪枝</strong></h4><p>预剪枝是在构建决策树时就决定是否停止进一步划分某个节点。主要通过以下标准来控制:</p>
<ul>
<li>当某个节点的样本数小于某个阈值时,不再继续划分。</li>
<li>当某个节点的 Gini 不纯度小于某个阈值时,不再继续划分。<br>预剪枝的优点是能够减少计算量,但缺点是可能会导致模型不够复杂,从而产生欠拟合。</li>
</ul>
<h4 id="后剪枝:"><a href="#后剪枝:" class="headerlink" title="后剪枝:"></a><strong>后剪枝</strong></h4><p>后剪枝是在决策树完全构建出来之后,对树进行修剪。具体过程如下:</p>
<ul>
<li>构建完整的决策树。</li>
<li>从叶子节点开始,逐渐向上遍历树的每个节点。</li>
<li>对每个节点进行判断,是否合适剪去该节点及其子树。如果剪去该子树后,模型的性能没有显著下降,就可以剪枝。<br>后剪枝通过避免过度拟合来提高模型的泛化能力,但其计算开销较大。</li>
</ul>
<h3 id="特征工程-特征提取"><a href="#特征工程-特征提取" class="headerlink" title="特征工程(特征提取)"></a>特征工程(特征提取)</h3><ul>
<li><strong>字典特征提取</strong><br>主要用于处理包含键值对key-value pairs的数据结构<br>   <figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">   <span class="keyword">from</span> sklearn.feature_extraction <span class="keyword">import</span> DictVectorizer</span><br><span class="line">   <span class="comment"># 字典特征提取</span></span><br><span class="line">   data = [</span><br><span class="line">       &#123;<span class="string">&#x27;city&#x27;</span>: <span class="string">&#x27;beijing&#x27;</span>, <span class="string">&#x27;temperature&#x27;</span>: <span class="number">100</span>&#125;,</span><br><span class="line">       &#123;<span class="string">&#x27;city&#x27;</span>: <span class="string">&#x27;shanghai&#x27;</span>, <span class="string">&#x27;temperature&#x27;</span>: <span class="number">95</span>&#125;,</span><br><span class="line">       &#123;<span class="string">&#x27;city&#x27;</span>: <span class="string">&#x27;guangzhou&#x27;</span>, <span class="string">&#x27;temperature&#x27;</span>: <span class="number">98</span>&#125;</span><br><span class="line">   ]</span><br><span class="line">   transfer = DictVectorizer(sparse=<span class="literal">False</span>)</span><br><span class="line">   new_data = transfer.fit_transform(data)</span><br><span class="line">   <span class="built_in">print</span>(transfer.feature_names_)</span><br><span class="line">   <span class="built_in">print</span>(new_data)</span><br></pre></td></tr></table></figure></li>
<li><strong>文本特征提取</strong><br>主要用于将文本数据(如句子、段落、文章等)转换成数值型特征。这对于文本分类、信息检索等任务非常重要。<br>   <figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line">   <span class="keyword">from</span> sklearn.feature_extraction.text <span class="keyword">import</span> CountVectorizer</span><br><span class="line">   <span class="comment"># 示例文本数据</span></span><br><span class="line">   data = [</span><br><span class="line">       <span class="string">&quot;I love programming&quot;</span>,</span><br><span class="line">       <span class="string">&quot;Python is great&quot;</span>,</span><br><span class="line">       <span class="string">&quot;I love machine learning&quot;</span></span><br><span class="line">   ]</span><br><span class="line">   <span class="comment"># 创建 CountVectorizer 对象</span></span><br><span class="line">   transfer = CountVectorizer()</span><br><span class="line">   <span class="comment"># 将文本数据转换为特征向量</span></span><br><span class="line">   new_data = transfer.fit_transform(data)</span><br><span class="line">   <span class="comment"># 输出特征名称</span></span><br><span class="line">   <span class="built_in">print</span>(<span class="string">&quot;Feature Names:&quot;</span>, transfer.get_feature_names_out())</span><br><span class="line">   <span class="comment"># 输出转换后的特征矩阵</span></span><br><span class="line">   <span class="built_in">print</span>(<span class="string">&quot;Transformed Data:&quot;</span>, new_data.toarray())</span><br></pre></td></tr></table></figure></li>
<li>文本特征提取(中文文本) <figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line">  <span class="keyword">from</span> sklearn.feature_extraction.text <span class="keyword">import</span> CountVectorizer</span><br><span class="line">  <span class="keyword">import</span> jieba</span><br><span class="line">  <span class="comment"># 中文文本数据大于20个字</span></span><br><span class="line">  data = [</span><br><span class="line">    <span class="string">&quot;我热爱编程,学习编程语言是一件非常有趣的事情,它能够提升我们解决问题的能力,编程让我变得更加有创意。&quot;</span>,</span><br><span class="line">    <span class="string">&quot;Python语言是一门非常强大的编程语言具有简洁的语法和丰富的库可以帮助开发者更高效地完成任务。&quot;</span>,</span><br><span class="line">    <span class="string">&quot;机器学习是一项非常有前途的技术,它能够让计算机从数据中自动学习,逐步提高模型的精确度,解决实际问题。&quot;</span></span><br><span class="line"></span><br><span class="line">  ]</span><br><span class="line">  <span class="comment"># 使用jieba分词</span></span><br><span class="line">  text_list = []</span><br><span class="line">  <span class="keyword">for</span> line <span class="keyword">in</span> data:</span><br><span class="line">    text_list.append(<span class="string">&quot; &quot;</span>.join(<span class="built_in">list</span>(jieba.cut(line))))</span><br><span class="line">  <span class="comment"># 创建 CountVectorizer 对象</span></span><br><span class="line">  transfer = CountVectorizer()</span><br><span class="line">  <span class="comment"># 将文本数据转换为特征向量</span></span><br><span class="line">  new_data = transfer.fit_transform(text_list)</span><br><span class="line">  <span class="comment"># 输出特征名称</span></span><br><span class="line">  <span class="built_in">print</span>(<span class="string">&quot;Feature Names:&quot;</span>, transfer.get_feature_names_out())</span><br><span class="line">  <span class="comment"># 输出转换后的特征矩阵</span></span><br><span class="line">  <span class="built_in">print</span>(<span class="string">&quot;Transformed Data:&quot;</span>, new_data.toarray())</span><br></pre></td></tr></table></figure></li>
</ul>
<h3 id="tf-idf"><a href="#tf-idf" class="headerlink" title="tf-idf"></a>tf-idf</h3><blockquote>
<p>词频 * 逆文档频率</p>
</blockquote>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># tfi-df</span></span><br><span class="line"><span class="keyword">from</span> sklearn.feature_extraction.text <span class="keyword">import</span> TfidfVectorizer</span><br><span class="line"><span class="keyword">import</span> jieba</span><br><span class="line">data=[<span class="string">&quot;一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。&quot;</span>,</span><br><span class="line"> <span class="string">&quot;我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去&quot;</span>,</span><br><span class="line"> <span class="string">&quot;如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系&quot;</span>]</span><br><span class="line"><span class="built_in">list</span> = []</span><br><span class="line"><span class="keyword">for</span> item <span class="keyword">in</span> data:</span><br><span class="line"> <span class="built_in">list</span>.append(<span class="string">&quot; &quot;</span>.join(jieba.cut(item)))</span><br><span class="line">transfer = TfidfVectorizer()</span><br><span class="line">new_data = transfer.fit_transform(<span class="built_in">list</span>)</span><br><span class="line"><span class="built_in">print</span>(<span class="string">f&quot;特征名字:\n<span class="subst">&#123;transfer.get_feature_names_out()&#125;</span>&quot;</span>)</span><br><span class="line"></span><br><span class="line"><span class="built_in">print</span>(<span class="string">f&quot;转换后的特征矩阵:\n<span class="subst">&#123; new_data.toarray()&#125;</span>&quot;</span>)</span><br><span class="line"><span class="built_in">print</span>(<span class="string">f&quot;转换后的数据:\n<span class="subst">&#123;new_data&#125;</span>&quot;</span>)</span><br></pre></td></tr></table></figure>
<h3 id="回归决策树"><a href="#回归决策树" class="headerlink" title="回归决策树"></a>回归决策树</h3><h4 id="决策树算法的应用-(泰坦尼克号沉船幸存者预测)"><a href="#决策树算法的应用-(泰坦尼克号沉船幸存者预测)" class="headerlink" title="决策树算法的应用 (泰坦尼克号沉船幸存者预测)"></a>决策树算法的应用 (泰坦尼克号沉船幸存者预测)</h4><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> seaborn <span class="keyword">as</span> sns</span><br><span class="line"><span class="keyword">import</span> pandas <span class="keyword">as</span> pd</span><br><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"><span class="keyword">from</span> sklearn.model_selection <span class="keyword">import</span> train_test_split</span><br><span class="line"><span class="keyword">from</span> sklearn.feature_extraction <span class="keyword">import</span> DictVectorizer</span><br><span class="line"><span class="keyword">from</span> sklearn.tree <span class="keyword">import</span> DecisionTreeClassifier,export_graphviz</span><br><span class="line"><span class="comment"># 1.获取数据集 - 加载 Titanic 数据集</span></span><br><span class="line">titanic = sns.load_dataset(<span class="string">&#x27;titanic&#x27;</span>)</span><br><span class="line">missing_age_count = titanic[<span class="string">&#x27;age&#x27;</span>].isna().<span class="built_in">sum</span>()</span><br><span class="line"><span class="comment"># print(f&quot;缺失的 age 数量: &#123;missing_age_count&#125;&quot;)</span></span><br><span class="line"><span class="comment"># 2. 数据基本处理</span></span><br><span class="line"><span class="comment"># 2.1 确认特征值、目标值</span></span><br><span class="line">X = titanic[[<span class="string">&#x27;pclass&#x27;</span>,<span class="string">&#x27;age&#x27;</span>,<span class="string">&#x27;sex&#x27;</span>]]</span><br><span class="line">y = titanic[<span class="string">&#x27;survived&#x27;</span>]</span><br><span class="line"><span class="comment"># 2.2 缺失值处理</span></span><br><span class="line">X.loc[:, <span class="string">&#x27;age&#x27;</span>] = X[<span class="string">&#x27;age&#x27;</span>].fillna(value=X[<span class="string">&#x27;age&#x27;</span>].mean()) <span class="comment"># 使用 .loc 进行修改</span></span><br><span class="line"><span class="comment"># 2.3 划分数据集</span></span><br><span class="line">X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=<span class="number">22</span>)</span><br><span class="line"><span class="comment"># 3. 
特征工程(字典特征提取)</span></span><br><span class="line">X_train = X_train.to_dict(orient=<span class="string">&quot;records&quot;</span>)</span><br><span class="line">X_test= X_test.to_dict(orient=<span class="string">&quot;records&quot;</span>)</span><br><span class="line">transfer = DictVectorizer()</span><br><span class="line">X_train = transfer.fit_transform(X_train)</span><br><span class="line">X_test = transfer.transform(X_test)</span><br><span class="line"><span class="comment"># 4. 机器学习 决策树算法</span></span><br><span class="line">estimator = DecisionTreeClassifier(criterion=<span class="string">&quot;gini&quot;</span>)</span><br><span class="line">estimator.fit(X_train,y_train)</span><br><span class="line">y_pred = estimator.predict(X_test)</span><br><span class="line"><span class="built_in">print</span>(<span class="string">f&quot;模型的测试集的预测值:<span class="subst">&#123;y_pred&#125;</span>&quot;</span>)</span><br><span class="line">ret = estimator.score(X_test,y_test)</span><br><span class="line"><span class="built_in">print</span>(<span class="string">f&quot;模型的评分:<span class="subst">&#123;ret&#125;</span>&quot;</span>)</span><br><span class="line"><span class="built_in">print</span>(X_test.toarray())</span><br></pre></td></tr></table></figure>
<p>生成对应的图</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.tree <span class="keyword">import</span> export_graphviz</span><br><span class="line"><span class="keyword">import</span> graphviz <span class="comment"># 用于渲染图像</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 导出决策树的 Graphviz 表示</span></span><br><span class="line">export_graphviz(estimator, out_file=<span class="string">&#x27;./data/tree.dot&#x27;</span>, </span><br><span class="line"> feature_names=transfer.get_feature_names_out()) <span class="comment"># 特征名称</span></span><br><span class="line"><span class="comment"># 使用 graphviz 渲染 .dot 文件</span></span><br><span class="line"><span class="keyword">with</span> <span class="built_in">open</span>(<span class="string">&#x27;./data/tree.dot&#x27;</span>, <span class="string">&#x27;r&#x27;</span>) <span class="keyword">as</span> f:</span><br><span class="line"> dot_graph = f.read()</span><br><span class="line"><span class="comment"># 渲染决策树</span></span><br><span class="line">graph = graphviz.Source(dot_graph)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 设置保存路径</span></span><br><span class="line">output_path = <span class="string">&#x27;./data/decision_tree&#x27;</span> <span class="comment"># 自定义保存路径</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 保存图像到指定路径,格式可以是 .png, .pdf, .jpg 等</span></span><br><span class="line"><span class="comment"># graph.render(output_path, format=&#x27;png&#x27;) # 保存为 .png 文件</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 显示图像</span></span><br><span class="line">graph.view(output_path) <span class="comment"># 打开图像path为保存路径不需要加后缀</span></span><br><span class="line"></span><br></pre></td></tr></table></figure>
<p><a target="_blank" rel="noopener" href="http://webgraphviz.com/">Webgraphviz</a>,这个网站可以将<code>tree.dot</code>文件的内容生成对应的可视化树</p>
<h4 id="回归决策树与线性回归的对比"><a href="#回归决策树与线性回归的对比" class="headerlink" title="回归决策树与线性回归的对比"></a>回归决策树与线性回归的对比</h4><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"><span class="keyword">import</span> matplotlib.pyplot <span class="keyword">as</span> plt</span><br><span class="line"><span class="keyword">from</span> sklearn.linear_model <span class="keyword">import</span> LinearRegression</span><br><span class="line"><span class="keyword">from</span> sklearn.tree <span class="keyword">import</span> DecisionTreeRegressor</span><br><span class="line"><span class="keyword">from</span> matplotlib <span class="keyword">import</span> rcParams</span><br><span class="line"></span><br><span class="line"><span class="comment"># 设置matplotlib使用的字体为SimHei黑体</span></span><br><span class="line">rcParams[<span class="string">&#x27;font.sans-serif&#x27;</span>] = [<span class="string">&#x27;SimHei&#x27;</span>] <span class="comment"># 也可以使用 &#x27;Microsoft YaHei&#x27;</span></span><br><span class="line">rcParams[<span class="string">&#x27;axes.unicode_minus&#x27;</span>] = <span class="literal">False</span> <span class="comment"># 正常显示负号</span></span><br><span class="line">x = np.array(<span class="built_in">list</span>(<span class="built_in">range</span>(<span class="number">1</span>,<span class="number">11</span>))).reshape(-<span class="number">1</span>,<span class="number">1</span>)</span><br><span class="line">y = ([<span class="number">5.56</span>,<span class="number">5.70</span>,<span class="number">5.91</span>,<span class="number">6.40</span>,<span class="number">6.80</span>,<span class="number">7.05</span>,<span class="number">8.90</span>,<span class="number">8.70</span>,<span class="number">9.00</span>,<span class="number">9.05</span>])</span><br><span class="line"></span><br><span class="line">m1 = DecisionTreeRegressor(max_depth=<span class="number">1</span>)</span><br><span class="line">m2 = DecisionTreeRegressor(max_depth=<span class="number">3</span>)</span><br><span class="line">m3 = DecisionTreeRegressor()</span><br><span class="line"></span><br><span class="line"><span class="comment"># 模型训练</span></span><br><span class="line">m1.fit(x,y)</span><br><span 
class="line">m2.fit(x,y)</span><br><span class="line">m3.fit(x,y)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 模型预测</span></span><br><span class="line">x_test = np.arange(<span class="number">0</span>,<span class="number">10</span>,<span class="number">0.01</span>).reshape(-<span class="number">1</span>,<span class="number">1</span>)</span><br><span class="line">y_1 = m1.predict(x_test)</span><br><span class="line">y_2 = m2.predict(x_test)</span><br><span class="line">y_3 = m3.predict(x_test)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 结果展示</span></span><br><span class="line">plt.figure(figsize=(<span class="number">10</span>,<span class="number">6</span>),dpi=<span class="number">100</span>)</span><br><span class="line">plt.scatter(x,y ,label = <span class="string">&quot;data&quot;</span>)</span><br><span class="line">plt.plot(x_test,y_1,label = <span class="string">&quot;max_depth=1&quot;</span>)</span><br><span class="line">plt.plot(x_test,y_2,label = <span class="string">&quot;max_depth=3&quot;</span>)</span><br><span class="line">plt.plot(x_test,y_3,label = <span class="string">&quot;linearregression&quot;</span>)</span><br><span class="line">plt.xlabel(<span class="string">&quot;数据&quot;</span>)</span><br><span class="line">plt.ylabel(<span class="string">&quot;预测值&quot;</span>)</span><br><span class="line">plt.legend()</span><br><span class="line">plt.show()</span><br></pre></td></tr></table></figure>
</article><div class="post-copyright"><div class="post-copyright__author"><span class="post-copyright-meta"><i class="fas fa-circle-user fa-fw"></i>文章作者: </span><span class="post-copyright-info"><a href="https://rq.shenjianl.cn">shenjianZ</a></span></div><div class="post-copyright__type"><span class="post-copyright-meta"><i class="fas fa-square-arrow-up-right fa-fw"></i>文章链接: </span><span class="post-copyright-info"><a href="https://rq.shenjianl.cn/posts/95.html">https://rq.shenjianl.cn/posts/95.html</a></span></div><div class="post-copyright__notice"><span class="post-copyright-meta"><i class="fas fa-circle-exclamation fa-fw"></i>版权声明: </span><span class="post-copyright-info">本博客所有文章除特别声明外,均采用 <a href="https://qr.shenjianl.cn/licenses/by-nc-sa/4.0/" target="_blank">CC BY-NC-SA 4.0</a> 许可协议。转载请注明来自 <a href="https://rq.shenjianl.cn" target="_blank">QuickReference</a></span></div></div><div class="tag_share"><div class="post-meta__tag-list"><a class="post-meta__tags" href="/tags/decisiontree/">decisiontree</a></div><div class="post_share"><div class="social-share" data-image="/img/machinelearning/decision-tree.png" data-sites="facebook,twitter,wechat,weibo,qq"></div><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/butterfly-extsrc@1.1.3/sharejs/dist/css/share.min.css" media="print" onload="this.media='all'"><script src="https://cdn.jsdelivr.net/npm/butterfly-extsrc@1.1.3/sharejs/dist/js/social-share.min.js" defer></script></div></div><nav class="pagination-post" id="pagination"><div class="prev-post pull-left"><a href="/posts/8816.html" title="集成学习"><img class="cover" src="/img/machinelearning/ensemble-learning.png" onerror="onerror=null;src='/img/404.jpg'" alt="cover of previous post"><div class="pagination-info"><div class="label">上一篇</div><div class="prev_info">集成学习</div></div></a></div><div class="next-post pull-right"><a href="/posts/60504.html" title="逻辑回归"><div class="cover" style="background: var(--default-bg-color)"></div><div class="pagination-info"><div class="label">下一篇</div><div class="next_info">逻辑回归</div></div></a></div></nav></div><div class="aside-content" id="aside-content"><div class="card-widget card-info"><div class="is-center"><div class="avatar-img"><img src="/img/avatar.jpg" onerror="this.onerror=null;this.src='/img/friend_404.gif'" alt="avatar"/></div><div class="author-info__name">shenjianZ</div><div class="author-info__description">一份快捷简便的文档,便于查阅编程的细节</div></div><div class="card-info-data site-data is-center"><a href="/archives/"><div class="headline">文章</div><div class="length-num">18</div></a><a href="/tags/"><div class="headline">标签</div><div class="length-num">9</div></a><a href="/categories/"><div class="headline">分类</div><div class="length-num">2</div></a></div><a id="card-info-btn" target="_blank" rel="noopener" href="https://github.com/shenjianz"><i class="fab fa-github"></i><span>Follow Me</span></a><div class="card-info-social-icons is-center"><a class="social-icon" href="https://github.com/shenjianZ" target="_blank" title="Github"><i class="fab fa-github" style="color: #24292e;"></i></a><a class="social-icon" href="mailto:15202078626@163.com" target="_blank" title="Email"><i class="fas fa-envelope" style="color: #4a7dbe;"></i></a></div></div><div class="card-widget card-announcement"><div class="item-headline"><i class="fas fa-bullhorn fa-shake"></i><span>公告</span></div><div class="announcement_content">一个简单快捷的文档知识点查阅网站</div></div><div class="sticky_layout"><div class="card-widget" id="card-toc"><div class="item-headline"><i class="fas 
fa-stream"></i><span>目录</span><span class="toc-percentage"></span></div><div class="toc-content is-expand"><ol class="toc"><li class="toc-item toc-level-3"><a class="toc-link" href="#C4-5"><span class="toc-number">1.</span> <span class="toc-text">C4.5</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#ID3"><span class="toc-number">2.</span> <span class="toc-text">ID3</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#Cart"><span class="toc-number">3.</span> <span class="toc-text">Cart</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#cart%E5%89%AA%E6%9E%9D"><span class="toc-number">4.</span> <span class="toc-text">cart剪枝</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#%E9%A2%84%E5%89%AA%E6%9E%9D%EF%BC%9A"><span class="toc-number">4.1.</span> <span class="toc-text">预剪枝:</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#%E5%90%8E%E5%89%AA%E6%9E%9D%EF%BC%9A"><span class="toc-number">4.2.</span> <span class="toc-text">后剪枝:</span></a></li></ol></li><li class="toc-item toc-level-3"><a class="toc-link" href="#%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B-%E7%89%B9%E5%BE%81%E6%8F%90%E5%8F%96"><span class="toc-number">5.</span> <span class="toc-text">特征工程(特征提取)</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#tf-idf"><span class="toc-number">6.</span> <span class="toc-text">tf-idf</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%9B%9E%E5%BD%92%E5%86%B3%E7%AD%96%E6%A0%91"><span class="toc-number">7.</span> <span class="toc-text">回归决策树</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#%E5%86%B3%E7%AD%96%E6%A0%91%E7%AE%97%E6%B3%95%E7%9A%84%E5%BA%94%E7%94%A8-%EF%BC%88%E6%B3%B0%E5%9D%A6%E5%B0%BC%E5%85%8B%E5%8F%B7%E6%B2%89%E8%88%B9%E5%B9%B8%E5%AD%98%E8%80%85%E9%A2%84%E6%B5%8B%EF%BC%89"><span class="toc-number">7.1.</span> <span class="toc-text">决策树算法的应用 (泰坦尼克号沉船幸存者预测)</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#%E5%9B%9E%E5%BD%92%E5%86%B3%E7%AD%96%E6%A0%91%E4%B8%8E%E7%BA%BF%E6%80%A7%E5%9B%9E%E5%BD%92%E7%9A%84%E5%AF%B9%E6%AF%94"><span class="toc-number">7.2.</span> <span class="toc-text">回归决策树与线性回归的对比</span></a></li></ol></li></ol></div></div><div class="card-widget card-recent-post"><div class="item-headline"><i class="fas fa-history"></i><span>最新文章</span></div><div class="aside-list"><div class="aside-list-item"><a class="thumbnail" href="/posts/8816.html" title="集成学习"><img src="/img/machinelearning/ensemble-learning.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="集成学习"/></a><div class="content"><a class="title" href="/posts/8816.html" title="集成学习">集成学习</a><time datetime="2025-01-25T07:12:08.000Z" title="发表于 2025-01-25 15:12:08">2025-01-25</time></div></div><div class="aside-list-item"><a class="thumbnail" href="/posts/95.html" title="决策树算法"><img src="/img/machinelearning/decision-tree.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="决策树算法"/></a><div class="content"><a class="title" href="/posts/95.html" title="决策树算法">决策树算法</a><time datetime="2025-01-24T04:39:59.000Z" title="发表于 2025-01-24 12:39:59">2025-01-24</time></div></div><div class="aside-list-item no-cover"><div class="content"><a class="title" href="/posts/60504.html" title="逻辑回归">逻辑回归</a><time datetime="2025-01-20T07:30:08.000Z" title="发表于 2025-01-20 15:30:08">2025-01-20</time></div></div><div class="aside-list-item no-cover"><div class="content"><a 
class="title" href="/posts/52662.html" title="线性回归">线性回归</a><time datetime="2025-01-19T08:46:51.000Z" title="发表于 2025-01-19 16:46:51">2025-01-19</time></div></div><div class="aside-list-item no-cover"><div class="content"><a class="title" href="/posts/12462.html" title="C lang">C lang</a><time datetime="2025-01-15T12:41:26.000Z" title="发表于 2025-01-15 20:41:26">2025-01-15</time></div></div></div></div></div></div></main><footer id="footer" style="background: transparent"><div id="footer-wrap"><div class="copyright">&copy;2024 - 2025 By shenjianZ</div><div class="framework-info"><span>框架 </span><a target="_blank" rel="noopener" href="https://hexo.io">Hexo</a><span class="footer-separator">|</span><span>主题 </span><a target="_blank" rel="noopener" href="https://github.com/jerryc127/hexo-theme-butterfly">Butterfly</a></div><div class="footer_custom_text"><a target="_blank" rel="noopener" href="https://beian.miit.gov.cn/#/Integrated/recordQuery"><img class="icp-icon" src="https://beian.mps.gov.cn/img/logo01.dd7ff50e.png"><span>备案号豫ICP备2023019300号</span></a></div></div></footer></div><div id="rightside"><div id="rightside-config-hide"><button id="readmode" type="button" title="阅读模式"><i class="fas fa-book-open"></i></button><button id="darkmode" type="button" title="浅色和深色模式转换"><i class="fas fa-adjust"></i></button><button id="hide-aside-btn" type="button" title="单栏和双栏切换"><i class="fas fa-arrows-alt-h"></i></button></div><div id="rightside-config-show"><button id="rightside-config" type="button" title="设置"><i class="fas fa-cog fa-spin"></i></button><button class="close" id="mobile-toc-button" type="button" title="目录"><i class="fas fa-list-ul"></i></button><button id="go-up" type="button" title="回到顶部"><span class="scroll-percent"></span><i class="fas fa-arrow-up"></i></button></div></div><div><script src="/js/utils.js?v=4.13.0"></script><script src="/js/main.js?v=4.13.0"></script><script src="https://cdn.jsdelivr.net/npm/@fancyapps/ui@5.0.33/dist/fancybox/fancybox.umd.min.js"></script><div class="js-pjax"></div><script src="https://cdn.jsdelivr.net/npm/butterfly-extsrc@1.1.3/dist/activate-power-mode.min.js"></script><script>POWERMODE.colorful = true;
POWERMODE.shake = true;
POWERMODE.mobile = false;
document.body.addEventListener('input', POWERMODE);
</script><script async data-pjax src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script><div id="local-search"><div class="search-dialog"><nav class="search-nav"><span class="search-dialog-title">搜索</span><span id="loading-status"></span><button class="search-close-button"><i class="fas fa-times"></i></button></nav><div class="is-center" id="loading-database"><i class="fas fa-spinner fa-pulse"></i><span> 数据库加载中</span></div><div class="search-wrap"><div id="local-search-input"><div class="local-search-box"><input class="local-search-box--input" placeholder="搜索文章" type="text"/></div></div><hr/><div id="local-search-results"></div><div id="local-search-stats-wrap"></div></div></div><div id="search-mask"></div><script src="/js/search/local-search.js?v=4.13.0"></script></div></div></body></html>