Tools used
Kettle, Navicat, Tableau, XMind
Dataset
Understanding the data (after preprocessing)
user_id
User ID
item_id
Item ID
category_id
Item category ID
behavior_type
Behavior type
pv: item detail page view, equivalent to a click
buy: purchase of the item
cart: add the item to the shopping cart
fav: add the item to favorites
timestamps
Timestamp of the behavior
datetimes/dates/times/hours
Time fields derived from the timestamp
Importing 100 million rows of data
1. Command prompt
1.1 Log in to MySQL from the command prompt
1.2 Create the corresponding database and table in MySQL (a minimal DDL sketch is given after these steps)
2. Kettle
2.1 Create a new transformation -> Input: "CSV file input" -> Output: "Table output"
2.2 Open the CSV input step -> Browse -> select the dataset's .csv file -> uncheck "Header row present" -> Get Fields -> replace the auto-detected field names with the column names used in MySQL and adjust the field lengths
2.3 Open Table output -> New database connection -> choose MySQL -> fill in the connection name (connect mysql), host name (localhost), database name (taobao), port, user name and password -> click Test
2.4 Open the Pooling tab of Table output -> enable the connection pool -> set the initial size and maximum idle size (both 100) -> set the auto-cancel option to false
2.5 Options tab of Table output -> set the connection parameters
2.6 Table output -> set the commit size to 10000 -> check "Specify database fields" -> click Get Fields
2.7 On the canvas, drag a hop from the input step to the output step (main input of the step) -> right-click Table output -> "Change number of copies to start" (10-20)
2.8 Table output -> browse for the target table -> user_behavior -> click Get Fields -> map the database columns to the stream fields
2.9 Fill in the stream fields of the Table output field mapping -> Run -> Launch -> give the transformation a name
3. Command prompt (check that the import succeeded)
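Step 1.2 above only mentions creating the database and table; the actual DDL is not shown in these notes. A minimal sketch of what it could look like, assuming generic column types for this dataset (adjust types and lengths to your needs), is:

CREATE DATABASE IF NOT EXISTS taobao;
USE taobao;
-- Raw columns of the dataset; the timestamp column is renamed to timestamps during preprocessing below
CREATE TABLE user_behavior (
    user_id       INT,
    item_id       INT,
    category_id   INT,
    behavior_type VARCHAR(5),
    timestamp     INT
);

For step 3, a simple row count is enough to confirm the import, e.g. SELECT COUNT(*) FROM user_behavior;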
Data preprocessing
Definition: data preprocessing means adding, removing, or adjusting fields and values as required. Here it covers field names, field types, field constraints, removing null values, removing duplicates, and removing anomalies.
use taobao;
desc user_behavior;
SELECT *
FROM user_behavior
LIMIT 5;
-- The column name timestamp clashes with the TIMESTAMP data type, so rename it
/*
Rename a column:
alter table <table> change <old_column> <new_column> <new_type>
*/
ALTER TABLE user_behavior CHANGE timestamp timestamps int(14);
desc user_behavior;
-- Check for NULL values
SELECT *
FROM user_behavior
WHERE user_id IS NULL;
SELECT *
FROM user_behavior
WHERE item_id IS NULL;
SELECT *
FROM user_behavior
WHERE category_id IS NULL;
SELECT *
FROM user_behavior
WHERE behavior_type IS NULL;
SELECT *
FROM user_behavior
WHERE timestamps IS NULL;
-- Check for duplicates (here we check for duplicate combinations of columns)
/*
Finding duplicates:
select <columns to check>
from <table>
group by <columns to check>
having count(<column>) > 1
*/
SELECT user_id,item_id,timestamps
FROM user_behavior
GROUP BY user_id,item_id,timestamps
HAVING COUNT(*)>1;
-- Remove duplicates
/*
Add a column and place it first:
alter table <table> add <column> <type> first
*/
ALTER TABLE user_behavior ADD id int FIRST;
SELECT *
FROM user_behavior
LIMIT 5;
/*
Make a column the primary key with auto-increment:
alter table <table> modify <column> <type> primary key auto_increment
Add a primary-key constraint to an existing table:
alter table <table> modify <column> <type> primary key
Auto-increment:
auto_increment
*/
ALTER TABLE user_behavior MODIFY id INT PRIMARY KEY auto_increment;
/*
De-duplication:
1 Find the duplicates and name the result
1.1 group by the combination of columns, so duplicate rows collapse into one
1.2 use count(*) to find how many times each combination occurs
1.3 return the duplicated combinations together with the smallest id in each group
1.4 use this result as a named subquery
2 Delete the duplicates
2.1 filter with where
2.2 a row is deleted only when every subquery column matches the corresponding outer column
2.3 user_behavior.id > t2.id (limits the deletion so that only the row with the smallest id in each group is kept)
Note: when DELETE uses a subquery on the same table, do not write DELETE FROM <table>; write DELETE <table> FROM <table>, ...
*/
DELETE user_behavior
FROM user_behavior,
(
SELECT user_id,item_id,timestamps,MIN(id) id
FROM user_behavior
GROUP BY user_id,item_id,timestamps
HAVING COUNT(*)>1
) t2
WHERE user_behavior.user_id = t2.user_id
AND user_behavior.item_id = t2.item_id
AND user_behavior.timestamps = t2.timestamps
AND user_behavior.id > t2.id;
-- Add new columns: date, time, hour
-- Increase the buffer size
-- Check the current buffer size
/*
Inspect and adjust the buffer pool:
1 show variables lists system variables and their values
2 like performs a fuzzy match
3 set global sets a global variable, followed by the assignment
*/
SHOW VARIABLES LIKE '%_buffer%';
-- Set the buffer pool to roughly 10 GB
SET GLOBAL innodb_buffer_pool_size = 10700000000;
-- TIMESTAMP(0): the 0 means zero fractional-second digits
-- Add a datetime column with zero fractional-second digits
ALTER TABLE user_behavior ADD datetimes TIMESTAMP(0);
-- Convert the int timestamp to a date-type TIMESTAMP (this ran for 5 hours, so I will reduce the data volume afterwards)
/*
Converting an int timestamp to a date-type TIMESTAMP:
1 update the existing rows
update <table> set <column> = <value>
2 convert the int timestamp to a datetime
from_unixtime(<timestamp>)
*/
UPDATE user_behavior SET datetimes = FROM_UNIXTIME(timestamps);
-- Too much data: delete rows so that only 500,000 remain
DELETE FROM user_behavior
WHERE id > 500000;
-- Populate dates, times, hours
-- Add date, time and hour columns with suitable data types and lengths
ALTER TABLE user_behavior ADD dates char(10);
ALTER TABLE user_behavior ADD times char(8);
ALTER TABLE user_behavior ADD hours char(2);
/* SUBSTRING(str, start, length): str is the string, start is the 1-based position to start from, length is the number of characters to take
*/
UPDATE user_behavior set dates = SUBSTRING(datetimes,1,10),
times = SUBSTRING(datetimes,12,8),
hours = SUBSTRING(datetimes,12,2);
-- Remove anomalies: rows outside the expected time range (2017-11-25 to 2017-12-03)
SELECT MAX(datetimes),MIN(datetimes)
FROM user_behavior;
-- Delete the out-of-range rows
DELETE FROM user_behavior
WHERE datetimes < "2017-11-25 00:00:00"
OR datetimes > "2017-12-03 23:59:59";
-- Data overview
DESC user_behavior;
SELECT *
FROM user_behavior
LIMIT 5;
SELECT COUNT(*) FROM user_behavior; # 499,779 rows
User acquisition
-- Create a temporary table with the same columns as the original table but no data
/*
Create a table that copies the original table's structure but not its data:
create table <temp_table> like <original_table>
*/
CREATE TABLE temp_behavior
LIKE user_behavior;
-- Copy 100,000 rows from the original table into the temporary table
/*
Insert a slice of the original table into the temporary table:
insert into <temp_table> select <columns> from <table> limit <row_count>
*/
INSERT INTO temp_behavior
SELECT *
FROM user_behavior
LIMIT 100000;
-- Inspect the temporary table
SELECT *
FROM temp_behavior;
-- PV: page views, i.e. clicks per day
SELECT dates,COUNT(*) "pv"
FROM temp_behavior
WHERE behavior_type = "pv"
GROUP BY dates
ORDER BY dates;
-- UV: unique visitors per day, counting each user only once
SELECT dates,COUNT(DISTINCT user_id) "uv"
FROM temp_behavior
WHERE behavior_type = "pv"
GROUP BY dates
ORDER BY dates;
-- PV, UV and PV per UV
SELECT dates,
COUNT(*) "pv",
COUNT(DISTINCT user_id) "uv",
ROUND(COUNT(*)/COUNT(DISTINCT user_id),1) "pv/uv"
FROM temp_behavior
WHERE behavior_type = "pv"
GROUP BY dates
ORDER BY dates;
-- Create the acquisition summary table
/*
Create a table and define its columns:
create table <table>(<column1> <type1>, <column2> <type2>)
decimal (exact fixed-point) is more precise and more compact than float and double, and is commonly used for financial calculations.
1 floating-point types round, decimal does not
2 decimal(a, b)
a: total number of digits
b: number of digits after the decimal point
*/
CREATE TABLE pv_uv_puv(
dates char(10),
pv int(9),
uv int(9),
puv decimal(10,1)
);
-- Insert the data from the original table into the acquisition table
/*
round(a, b)
a: the number to round
b: number of decimal places to keep
*/
INSERT INTO pv_uv_puv
SELECT dates,
COUNT(*) "pv",
COUNT(DISTINCT user_id) "uv",
ROUND(COUNT(*)/COUNT(DISTINCT user_id),1) "pv/uv"
FROM user_behavior
WHERE behavior_type = "pv"
GROUP BY dates
ORDER BY dates;
-- Inspect the acquisition table
SELECT *
FROM pv_uv_puv
ORDER BY dates;
-- * This acquisition table contains no NULLs; if NULLs appear, they can be removed as follows
DELETE FROM pv_uv_puv WHERE dates IS NULL;
Retention (list23)
SELECT * FROM user_behavior WHERE dates IS NULL;
SELECT *
FROM temp_behavior;
-- Retention rate: retention of active users; here we compute next-day retention
-- Dates on which each user was active
SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates;
-- Self-join
SELECT * FROM
(SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates
) a
,(SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates)
b
WHERE a.user_id=b.user_id;
-- Filter
SELECT * FROM
(SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates
) a
,(SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates)
b
WHERE a.user_id=b.user_id AND a.dates < b.dates;
-- Retention counts
SELECT a.dates,
COUNT(if(datediff(b.dates,a.dates)=0,b.user_id,NULL)) retention0,
COUNT(if(datediff(b.dates,a.dates)=1,b.user_id,NULL)) retention1
FROM
(SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates
) a
,(SELECT user_id,dates
FROM temp_behavior
GROUP BY user_id,dates)
b
WHERE a.user_id=b.user_id AND a.dates <= b.dates
GROUP BY a.dates;
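The query above only produces the retained-user counts. A minimal sketch of turning them into a next-day retention rate (retention1 / retention0), assuming the counts query is simply wrapped as a derived table, is:

SELECT dates,
       retention0,
       retention1,
       ROUND(retention1 / retention0, 4) retention_rate_day1  -- share of day-0 active users who return the next day
FROM (
    SELECT a.dates,
           COUNT(IF(DATEDIFF(b.dates, a.dates) = 0, b.user_id, NULL)) retention0,
           COUNT(IF(DATEDIFF(b.dates, a.dates) = 1, b.user_id, NULL)) retention1
    FROM (SELECT user_id, dates FROM temp_behavior GROUP BY user_id, dates) a,
         (SELECT user_id, dates FROM temp_behavior GROUP BY user_id, dates) b
    WHERE a.user_id = b.user_id AND a.dates <= b.dates
    GROUP BY a.dates
) t
ORDER BY dates;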
Time-series analysis (list23)
1. Create date_hour_behavior in SQL
SELECT dates,hours,
COUNT(IF(behavior_type = 'pv',behavior_type,NULL)) 'pv',
COUNT(IF(behavior_type = 'cart',behavior_type,NULL)) 'cart',
COUNT(IF(behavior_type = 'fav',behavior_type,NULL)) 'fav',
COUNT(IF(behavior_type = 'buy',behavior_type,NULL)) 'buy'
FROM temp_behavior
GROUP BY dates,hours
ORDER BY dates,hours
-- Save the result into a table
CREATE TABLE date_hour_behavior AS
(SELECT dates,hours,
COUNT(IF(behavior_type = 'pv',behavior_type,NULL)) 'pv',
COUNT(IF(behavior_type = 'cart',behavior_type,NULL)) 'cart',
COUNT(IF(behavior_type = 'fav',behavior_type,NULL)) 'fav',
COUNT(IF(behavior_type = 'buy',behavior_type,NULL)) 'buy'
FROM user_behavior
GROUP BY dates,hours
ORDER BY dates,hours)
-- Check the result
SELECT * FROM date_hour_behavior
2. Tableau
2.1 Day-of-week behavior
November 25-26 and December 2-3 are weekend days; every type of user behavior is higher on the weekend than on weekdays.
2.2 Hourly behavior
2.3 Day-of-week and hourly behavior combined
2.4 Probability that a user places an order
User conversion analysis (list23)
-- Number of distinct users per behavior type
SELECT behavior_type,
COUNT(DISTINCT user_id) use_num
FROM temp_behavior
GROUP BY behavior_type
ORDER BY behavior_type DESC
-- Save the result
CREATE TABLE behavior_user_num AS
SELECT behavior_type,
COUNT(DISTINCT user_id) use_num
FROM user_behavior
GROUP BY behavior_type
ORDER BY behavior_type DESC;
SELECT * FROM behavior_user_num;
-- 0.6631 (share of users who purchase: buy users / pv users)
SELECT 3444/5194
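The 3444 and 5194 above are hard-coded from the behavior_user_num output (buy users and pv users). A sketch that computes the same ratio directly from that table, without hard-coding the counts, could be:

SELECT ROUND(
         (SELECT use_num FROM behavior_user_num WHERE behavior_type = 'buy') /
         (SELECT use_num FROM behavior_user_num WHERE behavior_type = 'pv'),
       4) buy_user_ratio;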
-- Number of events per behavior type
SELECT behavior_type,
COUNT(*) behavior_count_num
FROM temp_behavior
GROUP BY behavior_type
ORDER BY behavior_type DESC
-- Save the result
CREATE TABLE behavior_num AS
SELECT behavior_type,
COUNT(*) behavior_count_num
FROM user_behavior
GROUP BY behavior_type
ORDER BY behavior_type DESC;
SELECT * FROM behavior_num;
-- buy/pv 0.0228
SELECT 10226/448998
-- (fav+cart)/pv 0.0903
SELECT (13013+27542)/448998
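Similarly, the buy/pv and (fav+cart)/pv rates above use hard-coded counts from behavior_num; a sketch that derives them directly from that table is:

SELECT ROUND(SUM(IF(behavior_type = 'buy', behavior_count_num, 0))
           / SUM(IF(behavior_type = 'pv', behavior_count_num, 0)), 4) buy_pv_rate,
       ROUND(SUM(IF(behavior_type IN ('fav', 'cart'), behavior_count_num, 0))
           / SUM(IF(behavior_type = 'pv', behavior_count_num, 0)), 4) fav_cart_pv_rate
FROM behavior_num;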
Behavior path analysis (list24)
-- Drop the views created below on temp_behavior, then recreate them from the original table
DROP VIEW user_behavior_view;
DROP VIEW user_behavior_standard;
DROP VIEW user_behavior_path;
DROP VIEW path_count;
-- IF() can be used inside an aggregate to filter on column values
CREATE VIEW user_behavior_view AS
SELECT user_id,item_id,
COUNT(IF(behavior_type = 'pv',behavior_type,NULL)) pv,
COUNT(IF(behavior_type = 'cart',behavior_type,NULL)) cart,
COUNT(IF(behavior_type = 'fav',behavior_type,NULL)) fav,
COUNT(IF(behavior_type = 'buy',behavior_type,NULL)) buy
FROM temp_behavior
GROUP BY user_id,item_id
-- Standardize user behavior into 0/1 flags (column aliases: 浏览了 = viewed, 收藏了 = favorited, 加购了 = added to cart, 购买了 = purchased)
CREATE VIEW user_behavior_standard AS
SELECT user_id,item_id,
(CASE WHEN pv>0 THEN 1 ELSE 0 END) '浏览了',
(CASE WHEN fav>0 THEN 1 ELSE 0 END) '收藏了',
(CASE WHEN cart>0 THEN 1 ELSE 0 END) '加购了',
(CASE WHEN buy>0 THEN 1 ELSE 0 END) '购买了'
FROM user_behavior_view
-- Path type (购买路径类型 = purchase path type)
CREATE VIEW user_behavior_path AS
SELECT *,
CONCAT(浏览了,收藏了,加购了,购买了) 购买路径类型
FROM user_behavior_standard AS a
WHERE a.购买了> 0
-- Count each purchase path type (数量 = count)
CREATE VIEW path_count AS
SELECT 购买路径类型,count(*) 数量
FROM user_behavior_path
GROUP BY 购买路径类型
ORDER BY 数量 DESC
-- Explanation table mapping each 4-digit path type to a description (e.g. 直接购买 = direct purchase, 浏览后购买 = viewed then purchased)
CREATE TABLE explanation(
path_type char(4),
description VARCHAR(40)
);
INSERT INTO explanation
VALUES('0001','直接购买'),
('1001','浏览后购买'),
('0011','加购后购买'),
('1011','浏览加购后购买'),
('0101','收藏后购买'),
('1101','浏览收藏后购买'),
('0111','收藏加购后购买'),
('1111','浏览收藏加购后购买');
SELECT * FROM explanation;
SELECT * FROM explanation e
JOIN path_count pc
ON pc.购买路径类型 = e.path_type
ORDER BY 数量 DESC
-- Now repeat the analysis on the full original table
-- IF() can be used inside an aggregate to filter on column values
CREATE VIEW user_behavior_view AS
SELECT user_id,item_id,
COUNT(IF(behavior_type = 'pv',behavior_type,NULL)) pv,
COUNT(IF(behavior_type = 'cart',behavior_type,NULL)) cart,
COUNT(IF(behavior_type = 'fav',behavior_type,NULL)) fav,
COUNT(IF(behavior_type = 'buy',behavior_type,NULL)) buy
FROM user_behavior
GROUP BY user_id,item_id
-- Standardize user behavior into 0/1 flags
CREATE VIEW user_behavior_standard AS
SELECT user_id,item_id,
(CASE WHEN pv>0 THEN 1 ELSE 0 END) '浏览了',
(CASE WHEN fav>0 THEN 1 ELSE 0 END) '收藏了',
(CASE WHEN cart>0 THEN 1 ELSE 0 END) '加购了',
(CASE WHEN buy>0 THEN 1 ELSE 0 END) '购买了'
FROM user_behavior_view
-- Path type
CREATE VIEW user_behavior_path AS
SELECT *,
CONCAT(浏览了,收藏了,加购了,购买了) 购买路径类型
FROM user_behavior_standard AS a
WHERE a.购买了> 0
-- Count each purchase path type
CREATE VIEW path_count AS
SELECT 购买路径类型,count(*) 数量
FROM user_behavior_path
GROUP BY 购买路径类型
ORDER BY 数量 DESC
-- Explanation table
CREATE TABLE explanation(
path_type char(4),
description VARCHAR(40)
);
INSERT INTO explanation
VALUES('0001','直接购买'),
('1001','浏览后购买'),
('0011','加购后购买'),
('1011','浏览加购后购买'),
('0101','收藏后购买'),
('1101','浏览收藏后购买'),
('0111','收藏加购后购买'),
('1111','浏览收藏加购后购买');
SELECT * FROM explanation;
SELECT * FROM explanation e
JOIN path_count pc
ON pc.购买路径类型 = e.path_type
ORDER BY 数量 DESC;
CREATE TABLE path_result AS
SELECT path_type,description,数量
FROM explanation e
JOIN path_count pc
ON pc.购买路径类型 = e.path_type
ORDER BY 数量 DESC;
SELECT *
FROM path_result
-- Purchases made without favoriting or adding to cart: 7785
SELECT sum(buy)
FROM user_behavior_view
WHERE buy>0 AND fav=0 AND cart=0
-- Total purchases: 10226
-- Purchases preceded by a favorite or add-to-cart: 2441
-- Favorite + add-to-cart events: 13013 + 27542
-- Probability of purchase after favoriting / adding to cart: 0.0602
SELECT 2441/(13013+27542)
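The 2441 above (purchases preceded by a favorite or add-to-cart) is not produced by any query shown in these notes; a sketch of how it could be obtained from user_behavior_view, mirroring the 7785 query above, is:

SELECT SUM(buy)
FROM user_behavior_view
WHERE buy > 0 AND (fav > 0 OR cart > 0);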
RFM model (list28)
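The notes stop at this heading; the RFM analysis itself is not included. Purely as a hedged sketch (not the author's implementation), recency and frequency could be derived from the existing buy events; note that this dataset has no monetary field, so only R and F are available:

-- Hypothetical sketch: per-user recency (days since last purchase, relative to 2017-12-03) and frequency (number of purchases)
SELECT user_id,
       DATEDIFF('2017-12-03', MAX(dates)) recency,
       COUNT(*) frequency
FROM user_behavior
WHERE behavior_type = 'buy'
GROUP BY user_id;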
Original: https://blog.csdn.net/tossing_a_coin/article/details/124420284
Author: tossing_a_coin
Title: 【MySQL实战】基于100万电商用户的1亿条行为数据分析——自发秩序