numpy、pandas、matplotlib学习

python

发布日期: 2020-11-02

文章字数: 8.9k

阅读次数:

Numpy

菜鸟教程：https://www.runoob.com/numpy/numpy-ndarray-object.html

numpy API

Python 的 list 只能与整数相乘，此时的效果是把 list 中的元素重复若干次：

>>> [1,2,3] * 3
[1, 2, 3, 1, 2, 3, 1, 2, 3]

如果要进行矢量运算，请改用 numpy.ndarray：

>>> import numpy as np
>>> ar = np.array([1,2,3])
>>> ar * 3
array([3, 6, 9])
#numpy数组索引
pre=pre[:,:,0]#不用三个中括号，只用一个，里面只用一个逗号

numpy广播机制

axis详解

虽然是tf的教程，但是类似

0维，又称0维张量，数字，标量：1 ()
1维，又称1维张量，数组，vector：[1, 2, 3] (3)
2维，又称2维张量，矩阵，二维数组：[[1,2], [3,4]] (2,2)
3维，又称3维张量，立方（cube），三维数组：[[[5,6], [7,8]]] (1,2,2) ,越往里axis越大，axis是(0,1,2)每个3维的有一个2维的，每个2维的有2个1维的，每个1维的有2个标量,数字7的坐标是[0,1,0]
n维：你应该get到点了吧~

本来是2x3x4,tf.reduce_sum(tensor, axis=0)，加完了就变成了3x4

文件存储npy\npz

Numpy能够读写磁盘上的文本数据或二进制数据。

将数组以二进制格式保存到磁盘

np.load和np.save是读写磁盘数组数据的两个主要函数，默认情况下，数组是以未压缩的原始二进制格式保存在扩展名为.npy的文件中。

# Save: write the array to disk in the uncompressed binary .npy format
import numpy as np
a=np.arange(5)
np.save('test.npy',a)
# Load: read the array back from the .npy file written above
import numpy as np
a=np.load('test.npy')
print(a)

np.savez以未压缩的 .npz 格式将多个数组保存到单个文件中。

np.savez(outfile, x, y)
npzfile = np.load(outfile)
npzfile.files
['arr_0', 'arr_1']
np.savez(outfile, x=x, y=y)
npzfile = np.load(outfile)
sorted(npzfile.files)
['x', 'y']

常用函数


a = np.zeros(shape=(3,4,5))
# 返回a列表的元素总数：60
print(a.size)
print(np.size(a))
# 返回a列表的维度大小：(3,4,5)
print(a.shape)
print(np.shape(a))
# 返回a列表的第一维大小：3
print(len(a))

np.array(list)#将列表转化为数组
np.std()#求标准差
np.dot(a,b)#矩阵乘
np.max()#
np.multiply(a,b)#element wise
a*b#element wise
np.where(condition[,x,y])
#返回根据条件从x或y中选择的元素。条件为真，返回x，否则返回y。
#>>> a
#array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
#>>> np.where(a < 5, a, 10*a)
#array([ 0,  1,  2,  3,  4, 50, 60, 70, 80, 90])
np.unique(x)
#显示np数组中的unique值，还有返回index值等功能
np.ceil()#向上取整 
np.floor()#向下取整
np.round(a,decimals=0,out=None)#将数组四舍五入到指定的小数上
np.expand_dims(a,axis)->ndarray    eg:img_nd = np.expand_dims(img_nd, axis=2)
#img_nd原先是二维的(0,1),扩张完变(0,1,2)

#调整数组维度顺序
np.transpose(a,axes=None) eg:img_nd.transpose((2, 0, 1)) 或 np.transpose(img_nd,(2,0,1))
#(a,b,c)->(c,a,b)
ndarray.max(axis=None, out=None)#返回指定维度的最大值,没指定的话就是所有的最大值
np.newaxis
#作用是增加一个维度
#a=np.array([1,2,3,4,5])
#aa=a[:,np.newaxis]
#print(aa.shape)  (5,1) 现有5个,5个里每个再有1个
#print (aa)      [[1],[2],[3],[4],[5]] 
np.concatenate((a1, a2, ...), axis=0)
#将数组序列拼接在一起
>>> a = np.array([[1, 2], [3, 4]])#(2,2)
>>> b = np.array([[5, 6]])#(1,2)
>>> np.concatenate((a, b), axis=0)#拼接完->(3,2)
array([[1, 2],
       [3, 4],
       [5, 6]])
>>> np.concatenate((a, b.T), axis=1)# (2,2)+(2,1)->(2,3)
array([[1, 2, 5],
       [3, 4, 6]])
numpy.ravel(a, order='C')
ndarray.flatten(order='C')
#都是将多维数组降为一维，区别为:
#ndarray.flatten(order='C')返回拷贝，对拷贝的修改不会影响原数组
#numpy.ravel(a, order='C')返回视图，对视图的修改会影响原数组
#order:使用这个索引顺序读取a的元素。有'C' 'F' 'A' 'K'
#'C'意味着以行主、C风格的顺序索引元素，最后一个轴索引变化最快，回到第一个轴索引变化最慢。eg:[0,0,0],[0,0,1],[0,0,2],[0,1,0],[0,1,1],...
#'F'表示以fortran风格的列主顺序索引元素，第一个索引变化最快，最后一个索引变化最慢。注意，' C '和' F '选项不考虑底层数组的内存布局，只参考轴索引的顺序。eg:[0,0,0],[1,0,0],[2,0,0],[0,1,0],[1,1,0],[2,1,0],...
#'A'表示如果a在内存中是Fortran连续的，则以类似Fortran的索引顺序读取元素，否则以类似c的顺序读取
#'K'的意思是按照元素在内存中出现的顺序读取它们，除非在步数为负时反转数据。默认情况下，使用'C'索引顺序。
np.spacing(x)
#返回x和最近相邻数字之间的距离
#可以视为eps的一个泛化
np.argmax(array,axis)
#用于返回一个numpy数值中最大值的索引,当一组中同时出现几个最大值时，返回第一个最大值的索引值。
np.divide()
# 和/作用相同,就是除以
np.bincount(x, weights=None, minlength=None)
# 计算非负整数数组中每个值的出现次数。
# bin的个数至少比最大的x的值大一，如果设置了minlength,则至少有minlength个bin
>>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7]))
# 输出:array([1, 3, 1, 1, 0, 0, 0, 1])
np.diag(array) 
# 当array是一个一维数组时,会返回一个以array为对角线元素的矩阵
# 当array是一个二维矩阵时，会返回该矩阵的对角线元素
np.nanmean(array, axis)
# 计算指定axis的平均,忽略nan
>>> a = np.array([[1, np.nan], [3, 4]])
>>> np.nanmean(a)
2.6666666666666665
# 从一个可迭代对象创建一个新的一维数组。
numpy.fromiter(iter, dtype, count=- 1, *, like=None)
iterable = (x*x for x in range(5))
np.fromiter(iterable, float)
array([  0.,   1.,   4.,   9.,  16.])
# asfortranarray()返回在内存中以Fortran顺序排列的数组(ndim>= 1)
numpy.asfortranarray(a, dtype=None, *, like=None)
x = np.arange(6).reshape(2,3)
y = np.asfortranarray(x)
x.flags['F_CONTIGUOUS']
>>> False
y.flags['F_CONTIGUOUS']
>>> True
# 用0替换nan,用有限数字替换inf。
np.nan_to_num(x)
x = np.array([np.inf, -np.inf, np.nan, -128, 128])
>>> np.nan_to_num(x)
array([  1.79769313e+308,  -1.79769313e+308,   0.00000000e+000,
        -1.28000000e+002,   1.28000000e+002])
# 计算数据的直方图
hist,bins=np.histogram(a # array-like数据,但是必须被flattened
             , bins=10,# 如果是int,给定等宽bin的数量
             range=None,# 取值范围的上下界(lower, upper);每个bin是左闭右开区间,只有最后一个bin两端都闭
             normed=False, weights=None, density=None)
# hist是每个bin的数量，bins是边界 如
>>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
(array([0, 2, 1]), array([0, 1, 2, 3]))
# Return the cumulative sum of the elements along a given axis. 给定轴的累加
np.cumsum(a,# 输入的array
          axis=None,# 所指定的轴,如果未指定,就是对flatten array操作
          dtype=None, out=None)
# 上下翻转数组。
np.flipud(m:array_like)
# 左右翻转数组。
np.fliplr(m:array_like)
# 将数组逆时针旋转90度。前两个维度旋转。
np.rot90(m:array_like,k=1:旋转次数)
# 
np.flatnonzero()
numpy.tile(A, reps)
# Construct an array by repeating A the number of times given by reps.
# 和torch.clamp一样的用法,就是把数据夹紧到一个区间范围内
numpy.clip(a, a_min, a_max, out=None)
# 数组的拼接
>>> m = np.array([[1, 2], [3, 4]])
>>> m
array([[1, 2],
       [3, 4]])
>>> n = np.array([[5, 6], [7, 8]])
>>> n
array([[5, 6],
       [7, 8]])
# 水平拼接,输出为:
array([[1, 2, 5, 6],
       [3, 4, 7, 8]])
# 拼接的方法有：
np.append(m,n, axis=1)
np.c_[m, n]
np.concatenate((m, n), axis=1)
np.hstack((m, n))
# 竖直拼接,输出为：
# 拼接的方法有
np.r_[m, n]
np.append(m, n, axis=0)
np.concatenate((m, n))
np.vstack((m, n))
# 数组生成
# np.linspace()和np.arange()的区别:前者指定样本数量,后者指定步长
#调整数组顺序
np.transpose()

Pandas

教程：https://www.yiibai.com/pandas/python_pandas_environment_setup.html

API:https://pandas.pydata.org/pandas-docs/stable/reference/index.html

# 查看序列的index
Series.index
# Return a Series containing counts of unique values.结果对象将按降序排列，因此第一个元素是最常出现的元素。 默认情况下排除 NA 值。
index = pd.Index([3, 1, 2, 3, 4, np.nan])
index.value_counts()
3.0    2
1.0    1
2.0    1
4.0    1
dtype: int64
Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)
# 获取DataFrame的行数和列数
df.shape[0]  # 行数
df.shape[1]  # 列数
# 读取csv文件,
df = pd.read_csv('train.csv')
# to_string()把整个DataFrame渲染成一个字符串并返回,因此可以完整打印所有行；如果不使用该函数,直接print(df)时较长的数据只显示前面五行和末尾五行,中间部分以...代替。
print(df.to_string())
# 将DataFrame保存为csv文件
df.to_csv('site.csv')
# head()读取前面的n行,如果不填参数n,默认返回5行。
print(df.head(10))
# 返回每个group的前5行
GroupBy.head(n=5)
# tail()读取尾部的n行,如果不填参数n，默认返回5行，空行各个字段的值返回NaN。
print(df.tail(10))
# info()返回表格的一些基本信息
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457          # 行数，458 行，第一行编号为 0
Data columns (total 9 columns):            # 列数，9列
 #   Column    Non-Null Count  Dtype       # 各列的数据类型
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object         # non-null，意思为非空的数据    
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)                 # 类型
# 删除'B','C'两列
df.drop(columns=['B', 'C'])
# 
df.columns
# duplicated()返回数据是否重复
person = {
  "name": ['Google', 'Runoob', 'Runoob', 'Taobao'],
  "age": [50, 40, 40, 23]  
}
df = pd.DataFrame(person)
print(df.duplicated())
0    False
1    False
2     True
3    False
dtype: bool
# drop_duplicates()删除重复数据
DataFrame.drop_duplicates(subset=None,# 仅考虑用于识别重复项的某些列，默认情况下使用所有列。 
                          keep='first',# {'first', 'last', False},分别是除第一次出现全删除,除最后一次出现全删除,删除所有重复项
                          inplace=False,# Whether to drop duplicates in place or to return a copy.
                          ignore_index=False)
# 1.删除完全重复数据
data.drop_duplicates(inplace=True)
# 数据清洗时，会将带空值的行删除，此时DataFrame或Series类型的数据不再是连续的索引，可以使用reset_index()重置索引。
df.reset_index()
    index   class  max_speed # 之前的index被添加成一个column
0  falcon    bird      389.0
1  parrot    bird       24.0
2    lion  mammal       80.5
3  monkey  mammal        NaN
# 如果设置df.reset_index(drop=True),则之前的index不被添加成一个column
# 利用groupby之后也可以利用reset_index来回转
DataFrame.groupby(by=None,#用于确定groupby的组:mapping,function,label,or list of labels
                  axis=0,#Split along rows (0) or columns (1):{0 or ‘index’, 1 or ‘columns’}
                  level=None, as_index=True, sort=True, group_keys=True, squeeze=NoDefault.no_default, observed=False, dropna=True)
df = pd.DataFrame({'Animal': ['Falcon', 'Falcon','Parrot', 'Parrot'],
                   'Max Speed': [380., 370., 24., 26.]})
df.groupby(['Animal']).mean()
        Max Speed
Animal
Falcon      375.0
Parrot       25.0
# agg()在指定轴上使用一项或多项操作进行聚合。
DataFrame.agg(func=None,# 用于聚合数据的函数。如果是函数,则必须在传递DataFrame或传递给DataFrame.apply时工作：function, str, list or dict
              axis=0,# {0 or ‘index’,1 or ‘columns’}, default 0
              *args, **kwargs)
# The return can be:
# scalar : when Series.agg is called with single function
# Series : when DataFrame.agg is called with a single function
# DataFrame : when DataFrame.agg is called with several functions
# Return scalar, Series or DataFrame
df = pd.DataFrame({"id":[1,2,3,4],"name":[[0,5,8],[0,4,7],[7,8,7],[7,7,5]]})
print(df.groupby("id")["name"].agg(list))
id
1    [[0, 5, 8]]
2    [[0, 4, 7]]
3    [[7, 8, 7]]
4    [[7, 7, 5]]
Name: name, dtype: object

DataFrame.copy(deep=True)
# deep=True,深拷贝,deep=False,浅拷贝
print(type(df.name))
# <class 'pandas.core.series.Series'> 调用一列就是一个Series
# map()根据输入对应关系映射Series的值。用于将Series中的每个值替换为另一个值,该值可能来自function,dict
Series.map(arg,# Mapping correspondence. 
           na_action=None)
s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
s.map({'cat': 'kitten', 'dog': 'puppy'})
.loc[] 
# 通过标签或布尔数组访问一组行和列。
            max_speed  shield
cobra               1       2
viper               4       5
sidewinder          7       8
df.loc['viper']# return a Series
df.loc[['viper', 'sidewinder']]# [[]]returns a DataFrame
df.loc['cobra', 'shield']# single label for row and column 返回的是该单元格的标量值(这里是整数2),而不是string
df.loc[:,["id","cell_type"]]# 先行后列,逗号分割
# 纯粹基于整数位置的索引，用于按位置选择。
df.iloc[1:3, 0:3]
# 只要每个group的第一个,和那个drop_duplicates起到的作用是一样的。
df.groupby('id').agg('first')

# DataFrame.iterrows(),Iterate over DataFrame rows as (index, Series) pairs.
df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
row = next(df.iterrows())[1]
row
int      1.0
float    1.5
Name: 0, dtype: float64
# 要以 Pandas 的方式迭代遍历DataFrame的行，可以使用：
for index, row in df.iterrows():
    print(row["c1"], row["c2"])
# 将Series转换成DataFrame
Series.to_frame(name=NoDefault.no_default)
>>>s = pd.Series(["a", "b", "c"],
              name="vals")
>>>s.to_frame()
  vals
0    a
1    b
2    c

groupby详解

https://zhuanlan.zhihu.com/p/101284491

group = data.groupby("company")
group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002B7E2650240>

生成了一个DataFrameGroupBy对象,为了看看group内部究竟是什么,将其转换成list来查看:

In [8]: list(group)
Out[8]:
[('A',   company  salary  age
  3       A      20   22
  6       A      23   33), 
 ('B',   company  salary  age
  4       B      10   17
  5       B      21   40
  8       B       8   30), 
 ('C',   company  salary  age
  0       C      43   35
  1       C      17   25
  2       C       8   30
  7       C      49   19)]

列表由三个元组组成,每个元组中,第一个元素是组别（这里是按照company进行分组，所以最后分为了A,B,C），第二个元素的是对应组别下的DataFrame,整个过程可以图解如下：

索引实际上变成了A,group by就是把index由0~N变成了A这一列

agg聚合操作

聚合操作是groupby后非常常见的操作，会写SQL的朋友对此应该是非常熟悉了。聚合操作可以用来求和、均值、最大值、最小值等。

# tqdm对于pandas提供了支持
import pandas as pd
import numpy as np
from tqdm import tqdm

df = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))
# Apply a function along an axis of the DataFrame.
DataFrame.apply(func,# 应用于每一列或每一行的函数。
                axis=0,# {0 or ‘index’, 1 or ‘columns’}, default 0
# 1 or ‘columns’: apply function to each row. 0 or ‘index’: apply function to each column.
                raw=False, result_type=None,# {‘expand’, ‘reduce’, ‘broadcast’, None}, default None
# These only act when axis=1 (columns):None取决于函数的返回值类型： list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns.

#‘expand’ : list-like results will be turned into columns.
#‘reduce’ : returns a Series if possible rather than expanding list-like results. This is the opposite of ‘expand’.
#‘broadcast’ : results will be broadcast to the original shape of the DataFrame, the original index and columns will be retained.
                args=(), **kwargs)
# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm.gui.tqdm`, `tqdm.notebook.tqdm`, optional kwargs, etc.)
tqdm.pandas(desc="my bar!")
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
df.progress_apply(lambda x: x**2)
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)
# Merge DataFrame or named Series objects with a database-style join.
# A named Series object is treated as a DataFrame with a single named column.
df.apply()
DataFrame.merge(right,# DataFrame or named Series 合并的对象
                how='inner',# {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}, default ‘inner’
                on=None,# label or list. Column or index level names to join on. 
                # 必须在两个 DataFrame 中都可以找到。 如果 on 是 None 并且不合并索引，则默认为两个 DataFrame 中列的交集。
                left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
# 合并方式
left: use only keys from left frame, similar to a SQL left outer join; preserve key order.
right: use only keys from right frame, similar to a SQL right outer join; preserve key order.
outer: use union of keys from both frames, similar to a SQL full outer join; sort keys lexicographically.
inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.
cross: creates the cartesian product from both frames, preserves the order of the left keys.
>>>df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>>df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>>df1.merge(df2, how='inner', on='a')
      a  b  c
0   foo  1  3
# 使用布尔表达式查询DataFrame的列。
df.query(expr, inplace=False, **kwargs)
# expr 要评估的查询字符串;可以在环境中引用变量,在变量前面加上@字符(@a+b);
df = pd.DataFrame({'A': range(1, 6),
                   'B': range(10, 0, -2),
                   'C C': range(10, 5, -1)})
df.query('A > B')
   A  B  C C
4  5  2    6
df.isna()
df.isnull()
# 都是用来判断是否为nan，完全相同，两个函数是为了模仿R的DataFrame

Matplotlib

易百教程：https://www.yiibai.com/matplotlib

API：https://matplotlib.org/stable/api/index.html

plt/fig/ax的区别

知乎教程

在matplotlib中,有两种画图方式：

plt.figure()：plt.xxx系列。通过plt.xxx来画图,其实是取了一个捷径。这是通过matplotlib提供的一个api,这个plt提供了很多基本的function可以让你很快的画出图来,但是如果你想要更细致的精调,就要使用另外一种方法。
```
plt.figure(1)  
plt.subplot(211)   
plt.plot(A,B)   
plt.show()
```
fig, ax = plt.subplots(): 这个就是正统的稍微复杂一点的画图方法了。指定figure和axes,然后对axes单独操作。等下就讲figure和axes都神马意思。
```
fig, ax = plt.subplots()   
ax.plot(A,B)
```

强烈建议在初学matplotlib的时候，尽量避免使用plt.xxx系列。当你明白figure/axes/axis都是控制什么的时候，如果你想要简单的制作一个quick and dirty的图，用plt.xxx才是OK。

matplotlib的名词定义对于非英语母语的人来说实在是太不友好了,尤其是axes。仰天长啸。

Figure:fig = plt.figure(): 可以解释为画布。
- 画图的第一件事，就是创建一个画布figure，然后在这个画布上加各种元素。
Axes:ax = fig.add_subplot(1,1,1): 不想定义,没法定义,就叫他axes！
- 首先，这个不是你画图的xy坐标轴！
- 希望当初写这个lib的时候他们用一个更好的名字。。。
- 可以把axes理解为你要放到画布上的各个物体。比如你要画一个太阳,一个房子,一个车在画布上,那么太阳是一个axes,房子是一个axes,etc。
- 如果你的figure只有一张图,那么你只有一个axes。如果你的figure有subplot，那么每一个subplot就是一个axes。
- axes是matplotlib的宇宙中心!axes下可以修改编辑的变量非常多,基本上能包含你的所有需求。
Axis:ax.xaxis/ax.yaxis: 对,这才是你的xy坐标轴。
- 每个坐标轴实际上也是由竖线和数字组成的,每一个竖线其实也是一个axis的subplot，因此ax.xaxis也存在axes这个对象。对这个axes进行编辑就会修改xaxis图像上的表现。
Artist:基本上，在 Figure上可见的所有东西都是 Artist（甚至是 Figure、Axes 和 Axis 对象）。这包括 Text 对象、Line2D 对象、集合对象、Patch 对象等。当Figure被渲染时，所有的Artists都被绘制到画布上。大多数Artists都被绑在axes上；这样的artist不能被多个轴共享，也不能从一个轴移动到另一个轴。

「为什么plt没有指定画布和区域也能作图？」

因为matplotlib默认在最近创建的画布上绘制,而当你没有指定区域,告诉它去画图,他就会自动去生成一个Axes去绘制,进一步没有画布,也会自动去创建一个Figure,也称为隐式绘制。

图像的各个部位名称

使用ax标准的流程：

创建一个画布(Figure)
创建一个或者多个Axes
使用ax.xxxx在指定Axes上绘图

首先，搞个画布

我喜欢用subplots这个命令来开始画图。哪怕你没有subplot,也可以用这个subplots来创建一个画布。

这个function创建了一个大小为(14,7)的画布,把这个画布赋值给变量fig,同时在这个画布上创建了一个axes,把这个axes赋值给ax。这样,所有未来的fig.xxx都是对这个画布的操作，所有ax.xxx都是对这个axes的操作。

如果你有两个图(例如plt.subplots(2,1)),那么ax是一个含有两个Axes的numpy数组,用ax[0]、ax[1]访问;ax[0]就对应第一个subplot的ax。

fig, ax = plt.subplots(figsize=(14,7))
# fig, ax = plt.subplots(2,1,figsize=(14,7))
# ax[0].xxx
# ax[1].xxx

好了画布搞好了，画数据。

注意,我们这里依然不使用plt!因为我们要在这个axes上画数据,因此就用ax.plot()来画。画完第一个再call一次,再画第二个。

ax.plot(A,B)
ax.plot(B,A)

下面开始细节的处理

数据画好了就可以各种细调坐标轴啊,tick啊之类的。首先把标题和xy坐标轴的标题搞定。Again,不用plt。直接在axes上进行设定。

ax.set_title('Title',fontsize=18)
ax.set_xlabel('xlabel', fontsize=18,fontfamily = 'sans-serif',fontstyle='italic')
ax.set_ylabel('ylabel', fontsize='x-large',fontstyle='oblique')
ax.legend()

然后是xy坐标轴的一些属性设定, 也是在axes level上完成的。

ax.set_aspect('equal') 
ax.minorticks_on() 
ax.set_xlim(0,16) 
ax.grid(which='minor', axis='both')

最后是坐标轴tick和细节,这个在axes.xaxis或axes.yaxis上完成。

ax.xaxis.set_tick_params(rotation=45,labelsize=18,colors='w') 
start, end = ax.get_xlim() 
ax.xaxis.set_ticks(np.arange(start, end,1)) 
ax.yaxis.tick_right()

画图的时候,请坚持使用ax格式。

# 依次创建fig,ax
fig = plt.figure()
a1 = fig.add_axes([0,0,1,1])
x = np.arange(1,11)
a1.plot(x,np.exp(x))
a1.set_ylabel('exp')
fig.legend(labels = ('exp','log'),loc='upper left')
plt.show()
# 利用subplots直接获得fig,ax
N = 45
x, y = np.random.rand(2, N)
c = np.random.randint(1, 5, size=N)
s = np.random.randint(10, 220, size=N)

fig, ax = plt.subplots()

scatter = ax.scatter(x, y, c=c, s=s)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# produce a legend with a cross section of sizes from the scatter
handles, labels = scatter.legend_elements(prop="sizes", alpha=0.6)
legend2 = ax.legend(handles, labels, loc="upper right", title="Sizes")

plt.show()

可视化中间特征图

把网络中间某层的输出的特征图按通道作为图片进行可视化展示即可，如下述代码所示：

import matplotlib.pyplot as plt
#get feature map of layer_activation
plt.matshow(layer_activation[0, :, :, 4], cmap='viridis')

cmap参数设置

疑难解惑

1.整数还是浮点

为什么例如cityscape的labelID图片用plt.imread读取后再输出无法输出整数而是输出浮点数？

def imread(fname, format=None):
    """
    Read an image and return it as an array.

    The post quotes this function to explain why ``plt.imread`` yields floats
    for PNG files: PNG inputs are converted by ``_pil_png_to_float_array``,
    everything else by ``pil_to_array``.

    Parameters
    ----------
    fname : str or file-like
        A path or URL string, an object with ``geturl`` (as returned by
        ``urlopen()``), or an object with a ``name`` attribute.
    format : str, optional
        Extension to assume (e.g. ``'png'``). When None it is inferred from
        *fname*; 'png' is the fallback whenever no suffix can be derived.

    Returns
    -------
    The result of ``_pil_png_to_float_array`` when the opened image is a
    ``PngImageFile``, otherwise the result of ``pil_to_array``.
    """
    from urllib import parse
    if format is None:
        if isinstance(fname, str):
            parsed = parse.urlparse(fname)
            # If the string is a URL (Windows paths appear as if they have a
            # length-1 scheme), assume png.
            # I.e. a scheme longer than one character means a real URL rather
            # than a Windows drive letter, so the image is assumed to be a PNG.
            if len(parsed.scheme) > 1:
                ext = 'png'
            # Not a URL (e.g. a local / Windows path): use the file suffix.
            else:
                ext = Path(fname).suffix.lower()[1:]
        elif hasattr(fname, 'geturl'):  # Returned by urlopen().
            ext = 'png'
        elif hasattr(fname, 'name'):
            ext = Path(fname.name).suffix.lower()[1:]
        else:
            ext = 'png'
    else:
        ext = format
    # For png, open with PIL.PngImagePlugin.PngImageFile instead of the
    # generic PIL.Image.open.
    img_open = (
        PIL.PngImagePlugin.PngImageFile if ext == 'png' else PIL.Image.open)
    if isinstance(fname, str):
        parsed = parse.urlparse(fname)
        if len(parsed.scheme) > 1:  # Pillow doesn't handle URLs directly.
            _api.warn_deprecated(
                "3.4", message="Directly reading images from URLs is "
                "deprecated since %(since)s and will no longer be supported "
                "%(removal)s. Please open the URL for reading and pass the "
                "result to Pillow, e.g. with "
                "``PIL.Image.open(urllib.request.urlopen(url))``.")
            # hide imports to speed initial import on systems with slow linkers
            from urllib import request
            ssl_ctx = mpl._get_ssl_context()
            if ssl_ctx is None:
                _log.debug(
                    "Could not get certifi ssl context, https may not work."
                )
            with request.urlopen(fname, context=ssl_ctx) as response:
                import io
                try:
                    response.seek(0)
                except (AttributeError, io.UnsupportedOperation):
                    # The response is not seekable: buffer it fully in memory.
                    response = io.BytesIO(response.read())
                # Recurse with the opened stream and the extension found above.
                return imread(response, format=ext)
    with img_open(fname) as image:
        # PNG images go through _pil_png_to_float_array() instead of
        # pil_to_array(); this is where the float conversion happens.
        return (_pil_png_to_float_array(image)
                if isinstance(image, PIL.PngImagePlugin.PngImageFile) else
                pil_to_array(image))
def _pil_png_to_float_array(pil_png):
    """
    Convert a PIL `PNGImageFile` to a 0-1 float array.

    Every branch returns float32. Apart from rawmode "1" (cast only), the
    pixel values are divided by the maximum value of the bit depth
    (``2**bits - 1``), which is why integer pixel values come back as
    fractions.

    Raises
    ------
    ValueError
        If neither the raw mode nor the mode matches a handled case.
    """
    mode = pil_png.mode
    rawmode = pil_png.png.im_rawmode
    if rawmode == "1":  # Grayscale.
        return np.asarray(pil_png).astype(np.float32)
    if rawmode == "L;2":  # Grayscale.
        return np.divide(pil_png, 2**2 - 1, dtype=np.float32)
    if rawmode == "L;4":  # Grayscale.
        return np.divide(pil_png, 2**4 - 1, dtype=np.float32)
    # The label image discussed in this post has rawmode "L", so it takes this
    # branch and every pixel value is divided by 255.
    if rawmode == "L":  # Grayscale.
        return np.divide(pil_png, 2**8 - 1, dtype=np.float32)
    if rawmode == "I;16B":  # Grayscale.
        return np.divide(pil_png, 2**16 - 1, dtype=np.float32)
    if mode == "RGB":  # RGB.
        return np.divide(pil_png, 2**8 - 1, dtype=np.float32)
    # Palette and grayscale+alpha images are first converted to RGBA.
    if mode == "P":  # Palette.
        return np.divide(pil_png.convert("RGBA"), 2**8 - 1, dtype=np.float32)
    if mode == "LA":  # Grayscale + alpha.
        return np.divide(pil_png.convert("RGBA"), 2**8 - 1, dtype=np.float32)
    if mode == "RGBA":  # RGBA.
        return np.divide(pil_png, 2**8 - 1, dtype=np.float32)
    raise ValueError(f"Unknown PIL rawmode: {rawmode}")

pil_png.png是一个<PIL.PngImagePlugin.PngStream>对象;执行a = np.array(pil_png)或plt.imshow(pil_png)之后,pil_png.png都会变为None(暂时不确定原因,推测是图像数据被完整加载后PIL会关闭并释放这个PngStream)

所以其实把读到的浮点数组乘以255(再四舍五入转回整数类型)就会得到正确的整数标签值。

2.cmap问题

为什么plt.imshow()显示上面的被变成小数的array会是彩色的?

matplotlib.pyplot.imshow(X,# array-like or PIL image
                         cmap=None,# 颜色图谱 
                         norm=None,# 归一化,将scalar归一化到[0,1],默认的话使用线性缩放，最小值映射到0,最大值映射到1
                         aspect=None, interpolation=None, alpha=None, vmin=None, vmax=None, origin=None, extent=None, *, filternorm=True, filterrad=4.0, resample=None, url=None, data=None, **kwargs)

Display data as an image, i.e., on a 2D regular raster.

The input may either be actual RGB(A) data, or 2D scalar data, which will be rendered as a pseudocolor伪彩色 image. For displaying a grayscale image set up the colormapping using the parameters cmap='gray', vmin=0, vmax=255.cmap其实不就是PIL的调色板模式吗？！

对于X,the image data. Supported array shapes are:

(M, N): an image with scalar data. The values are mapped to colors using normalization and a colormap. See parameters norm, cmap, vmin, vmax.
(M, N, 3): an image with RGB values (0-1 float or 0-255 int).
(M, N, 4): an image with RGBA values (0-1 float or 0-255 int), i.e. including transparency.

The first two dimensions (M, N) define the rows and columns of the image.

Out-of-range RGB(A) values are clipped.

autumn	红-橙-黄
bone	黑-白，x线
cool	青-洋红
copper	黑-铜
flag	红-白-蓝-黑
gray	黑-白
hot	黑-红-黄-白
hsv	hsv颜色空间，红-黄-绿-青-蓝-洋红-红
inferno	黑-红-黄
jet	蓝-青-黄-红
magma	黑-红-白
pink	黑-粉-白
plasma	绿-红-黄
prism	红-黄-绿-蓝-紫-…-绿模式
spring	洋红-黄
summer	绿-黄
viridis	蓝-绿-黄
winter	蓝-绿

cmap不输入的话默认是viridis模式

When using scalar data and no explicit norm, vmin and vmax define the data range that the colormap covers. By default, the colormap covers the complete value range of the supplied data. It is deprecated to use vmin/vmax when norm is given. When using RGB(A) data, parameters vmin/vmax are ignored.

颜色

基本函数

根据RGB CMYK HSL HSV XYZ 值来查询颜色

import matplotlib.image as imgplt
import matplotlib.pyplot as plt
x = imgplt.imread('label.png')
plt.imshow(x[300:350,150:200])
plt.show()
#matplotlib显示图像，注意x只用一个[]，不同维度用逗号分割！
#imread的返回值是图片数据data，数据类型是class:`numpy.array`。这个图片数据data的维度如下：
#- (M, N) 对于灰度级图片
#- (M, N, 3) 对于RGB彩色图片.
#- (M, N, 4) 对于RGBA彩色图片
plt.colorbar()
# colorbar()如下图中红框标出

# Create a new figure, or activate an existing figure.
plt.figure(num=None,# A unique identifier for the figure.
           figsize=None,# 指定figure的宽和高，单位为英寸；
           dpi=None,
           facecolor=None,# 背景颜色
           edgecolor=None,# 边界颜色
           frameon=True,# 是否显示边框
           FigureClass=<class 'matplotlib.figure.Figure'>, clear=False, **kwargs)
# 图片的保存
plt.imsave(filename,X,format='png')
# 创建单个子图
plt.subplot(nrows,ncols,index)
#nrows:行数 ncols:列数 index:索引值,图片放在第几个窗格
# Create a figure and a set of subplots.
# 创建多个子图
pyplot.subplots(nrows=1,# 行数
                ncols=1,# 列数
                *, sharex=False, sharey=False, squeeze=True, subplot_kw=None, gridspec_kw=None,                          **fig_kw# 所有其他关键字参数都传递给 pyplot.figure 调用。
               )-> fig, ax
# ticks:设置刻度 labels:设置刻度标签
plt.xticks(ticks=None,labels=None,**kwargs)
# xlabel: label的文本
plt.xlabel(xlabel:str)
# 将图片展示到figure上
plt.imshow(train_images[i], cmap=plt.cm.binary)
#train_images[i]:必须是array-like或PIL image
#cmap:用于将标量数据映射到颜色的Colormap实例或已注册的colormap名称
# 显示figure
plt.show()
# 将当前Axes设置为ax,将当前Figure设置为ax的父级。
matplotlib.pyplot.sca(ax)# set current ax
# tight_layout会自动调整子图参数，使之填充整个图像区域。这是个实验特性，可能在一些情况下不工作。它仅仅检查坐标轴标签、刻度标签以及标题的部分。
# 因为当你拥有多个子图时，你会经常看到不同轴域的标签叠在一起。
plt.tight_layout()
# 去掉坐标轴
ax.axis("off")
# 去掉刻度
plt.xticks([])
plt.yticks([])

各种图形画法

散点图

def plot_embedding(data, label, title):
    """Scatter-plot a 2-D embedding whose rows interleave three groups.

    Rows 0, 3, 6, ... are drawn in red as "S", rows 2, 5, 8, ... in green as
    "D1" and rows 1, 4, 7, ... in blue as "D2". Each column is min-max
    normalised to [0, 1] before plotting.

    Args:
        data: 2-D array with one sample per row; only the first two columns
            are plotted.
        label: Unused; kept so existing callers keep working.
        title: Title shown above the axes.

    Returns:
        The matplotlib Figure holding the scatter plot.
    """
    fig, ax = plt.subplots(figsize=(12, 7))

    # Min-max normalisation per column. A constant column has zero range;
    # divide by 1 there so it maps to 0 instead of producing NaN.
    x_min, x_max = np.min(data, 0), np.max(data, 0)
    span = x_max - x_min
    span = np.where(span == 0, 1, span)
    data = (data - x_min) / span

    # (row offset, colour, legend label). The open-ended slice `offset::3`
    # keeps the final row; a stop of -1 would always drop the last sample.
    groups = (
        (0, "red", "S"),
        (2, "green", "D1"),
        (1, "blue", "D2"),
    )
    for offset, colour, name in groups:
        points = data[offset::3, :]
        # `label` here is what ax.legend() displays for this group.
        ax.scatter(points[:, 0], points[:, 1], c=colour, s=3, alpha=0.4,
                   label=name)

    ax.set_title(title)
    ax.legend()
    return fig

fig = plot_embedding(result, label,'t-SNE embedding of the digits')
plt.show()

PIL

PIL Api

import numpy as np   #可以这样用PIL读取处理图片,然后最后再转成ndarray
from PIL import Image

img = Image.open('label.png')
# 转换成img2之后,此时输出的值是真实的图片的值
# PIL读取的图片不能输出，用np.array()转化会变成单通道的灰度图
# PIL图像在转换为numpy.ndarray后，格式为(h,w,c)，像素顺序为RGB；
# OpenCV在cv2.imread()后数据类型为numpy.ndarray，格式为(h,w,c)，像素顺序为BGR。
img2 = np.array(img)
np.unique(img2)
img.show()
# 在windows使用自带的图像显示程序显示图像，在linux使用display\eog\xv,具体取决于找到哪个
# 直接输出img如下
print(img)
# <PIL.PngImagePlugin.PngImageFile image mode=L size=2048x1024 at 0x1C137643C70>
print(img.format)
# 打印出格式信息，此处输出PNG
print(img.mode)
# L
img3=img.convert("RGB")
# 转换图片的mode
# 将array转换成图片
a = torch.ones((3,2,3))
import PIL.Image as IMG
import numpy as np
a = IMG.fromarray(np.uint8(a))# 必须转换成整数
print(a)# PIL.Image.Image是一个类实例，包含自己的属性,通道顺序是(H,W,C)
# <PIL.Image.Image image mode=RGB size=2x3 at 0x7F51B6B9AF10>
# 转为调色板模式
new_mask = Image.fromarray(mask.astype(np.uint8)).convert('P')
# 模式'L'为灰色图像，它的每个像素用8个bit表示，0表示黑，255表示白，其他数字表示不同的灰度。
# 模式“P”为8位彩色图像，它的每个像素用8个bit表示，其对应的彩色值是按照调色板索引值查询出来的。
# 0-255,每个对应一个颜色。
# 标签图像的模式正是'P'模式，因此测试时要生成对应标签图像的图片的话，构建一个调色板然后上色即可。
new_mask.putpalette(palette)
# 保存图片,以给定的文件名保存此图像.如果未指定格式,则使用的格式由文件扩展名决定(如果可能)。
Image.save(fp, format=None, **params)
img.save('F:/360data/重要数据/桌面/att/050_1.png') #保存
img = img.resize((ow, oh), Image.BILINEAR)
# mask用最近邻插值,直接选最近的数据点赋值
mask = mask.resize((ow, oh), Image.NEAREST)
Image.crop(box=(left, upper, right, lower))# 切割图片,box是一个四元组
# left：左边界的x坐标 upper：上边界的y坐标 right：右边界的x坐标 lower：下边界的y坐标(都以图片左上角为原点,单位为像素)
#旋转图像,逆时针旋转90度,expand默认为0,表示旋转前后尺度不变,从HxW->HxW,expand=1就是HxW->WxH
yourimage.rotate(90,expand=1)

图像的模式,常见的mode 有如下:

jpg通常是RGB模式(也可能是L灰度或CMYK)

png常见的是RGBA模式,但也可能是RGB、L或P(调色板)模式

P模式可把我骗惨了，一个Png的图片,使用PIL.Image读取,可能是P模式的，虽然这个png图片你看着是有颜色的,但是其实读出来是1,2,3这种label。如果你把他转为RGB格式，则会变成对应label变成的对应palette

1:位图 L:灰度模式

三通道RGBnumpy读取显示是0-1，而不是0-255，这个是归一化的RGB！！正好和概率二分类对应！

==使用FastStone查看图片,图像是24bit是3通道的,8bit的是1通道的！==如果是8bit却是彩色值的话,那就说明是调色板模式

Snipaste竟然可以实时获取颜色值！！！如下：

tensor与pil相互转换

# PIL to tensor 
from PIL import Image
import torch
from torchvision import transforms

# Preprocessing pipeline: PIL image -> tensor -> resize -> per-channel
# normalisation with mean 0.5 and std 0.5.
# NOTE(review): Normalize is given three means/stds, so the image is
# presumably expected to have three channels — confirm for your data.
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(256),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
img = Image.open(img_path)
img = trans(img)
# Insert a new leading dimension (e.g. a batch axis) at position 0.
img = torch.unsqueeze(img, 0)

# tensor to PIL
image = im_tensor.cpu().clone()
image = image.squeeze(0) # 压缩一维
image = transforms.ToPILImage()(image) # 自动转换为0-255 并变换通道为HWC

有用的函数

可视化tensor

def tensor2im(input_image, imtype=np.uint8):
    """Convert a normalised CHW image tensor into an HWC image array.

    Parameters:
        input_image (tensor) --  input tensor of shape (C, H, W); note that
                                 there is no batch dimension
        imtype (type)        --  dtype of the returned numpy array

    Returns:
        For a ``torch.Tensor``: a new (H, W, C) array with the normalisation
        undone, scaled to [0, 255], clipped to that range and cast to
        ``imtype``. A single-channel tensor is repeated to three channels.
        For a numpy array: the array cast to ``imtype`` with no other change.
        Any other object is returned as is.

    The input tensor is never modified.
    """
    # These must match the mean/std used by the dataloader's normalisation.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    if isinstance(input_image, np.ndarray):
        # numpy input is passed through; only the dtype is changed.
        return input_image.astype(imtype)
    if not isinstance(input_image, torch.Tensor):
        return input_image

    # .numpy() shares memory with a CPU float tensor, so take a copy before
    # the in-place de-normalisation below; otherwise the caller's tensor
    # would be overwritten.
    image_numpy = input_image.detach().cpu().float().numpy().copy()
    if image_numpy.shape[0] == 1:  # grayscale to RGB
        image_numpy = np.tile(image_numpy, (3, 1, 1))
    # Undo the normalisation: multiply by std, add mean, channel by channel.
    for channel, (m, s) in enumerate(zip(mean, std)):
        image_numpy[channel] = image_numpy[channel] * s + m
    # Undo ToTensor(): [0, 1] -> [0, 255]. Clip so that values outside the
    # range saturate instead of wrapping around in the integer cast.
    image_numpy = np.clip(image_numpy * 255, 0, 255)
    # (channels, height, width) -> (height, width, channels)
    image_numpy = np.transpose(image_numpy, (1, 2, 0))
    return image_numpy.astype(imtype)

特征图热力图

    def get_heatmap(self,data):
        """Render one feature-map channel as a seaborn heatmap image.

        Args:
            data: 3-D array indexed as ``data[:, :, 0]``; only this first
                slice of the last axis is drawn.

        Returns:
            A uint8 array of shape (height, width, 3) with the RGB pixels of
            the rendered figure.

        Side effects:
            Clears and redraws the current pyplot figure.
        """
        import seaborn as sns
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_agg import FigureCanvasAgg

        data = data[:, :, 0]
        # Alternative without matplotlib:
        # cv2.applyColorMap(np.uint8(255*data), cv2.COLORMAP_JET)

        plt.clf()
        sns.heatmap(data, center=0)
        plt.axis('off')

        # Rasterise the current figure with the Agg backend.
        canvas = FigureCanvasAgg(plt.gcf())
        canvas.draw()

        # buffer_rgba() exposes the rendered pixels as (height, width, 4)
        # RGBA. Copy them, since the buffer belongs to the renderer. This
        # replaces the deprecated np.fromstring / tostring_argb round trip.
        rgba = np.array(canvas.buffer_rgba(), dtype=np.uint8)

        # Drop the alpha channel to obtain the RGB image.
        heatmap = rgba[:, :, :3]

        return heatmap

cv2

opencv的API

基本函数

import cv2
# Loads an image from a file.
cv2.imread(filename[,# Name of file to be loaded.
          flags]# Flag that can take values of cv2::ImreadModes
         )
# 返回值:<class 'numpy.ndarray'>,如果读取文件错误,那么返回空矩阵
# In the case of color images, the decoded images will have the channels stored in B G R order.
# 返回的通道顺序为 H W C

cv2.imwrite(filename, img[, params]    ) 
# 报错,module 'cv2' has no attribute 'imread'
# 版本问题,这个算法被申请了专利，把版本降下去就可以了。依次执行以下操作
pip uninstall opencv-python
pip install opencv-python==3.4.2.16
pip install opencv-contrib-python==3.4.2.16

# Converts an image from one color space to another.
# OpenCV默认颜色格式为BGR
cv.cvtColor(src, code[,# 色彩空间转换代码
            dst[, dstCn]])
image=cv.cvtColor(image,cv.COLOR_BGR2RGB)

R、G、B通道值的常规范围是:

0 to 255 for CV_8U images
0 to 65535 for CV_16U images
0 to 1 for CV_32F images

# 将多通道的array分为几个单通道的array
cv.split(img)
# 将几个单通道的array合并为一个多通道的array
cv.merge((a1,a2,a3))

ColorConversionCodes

COLOR_BGR2RGB

实用功能小组件

show image and mask

def show_img(img, mask=None):
    """Display an image and, optionally, a half-transparent mask over it.

    Args:
        img: Image passed to ``plt.imshow`` with the 'bone' colormap.
        mask: Optional overlay drawn on top with alpha 0.5. When given, a
            legend with three fixed colour patches is added as well.
    """
    # The CLAHE object is created but not applied; the apply step
    # (img = clahe.apply(img)) is left disabled, as in the original notes.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    plt.imshow(img, cmap='bone')

    if mask is not None:
        # Alternative overlay that hides everything except label 1:
        # plt.imshow(np.ma.masked_where(mask!=1, mask), alpha=0.5, cmap='autumn')
        plt.imshow(mask, alpha=0.5)
        legend_colors = [(0.667,0.0,0.0), (0.0,0.667,0.0), (0.0,0.0,0.667)]
        handles = []
        for rgb in legend_colors:
            # One unit square per colour serves as the legend patch.
            handles.append(Rectangle((0,0),1,1, color=rgb))
        plt.legend(handles, ["Large Bowel", "Small Bowel", "Stomach"])

    plt.axis('off')

直方图均衡教程,包括上图的createCLAHE的自适应局部直方图均衡

Histogram Equalization

import cv2 as cv
import numpy as np
from matplotlib import pyplot as plt
path = "./1.jpeg"
img = cv.imread(path)
# cv.imread signals a failed read through its return value rather than by
# raising, so fail here with a clear message.
if img is None:
    raise FileNotFoundError(path)
print(np.shape(img))
cv.imshow('image',img)
cv.waitKey(0)
cv.destroyAllWindows()

# Histogram over all pixel values (256 bins on [0, 256)) and its cumulative sum.
hist,bins = np.histogram(img.flatten(),256,[0,256])
cdf = hist.cumsum()
# Rescale the cdf so its maximum equals the histogram peak; this is only so
# both curves fit on the same plot.
cdf_normalized = cdf * float(hist.max()) / cdf.max()
plt.plot(cdf_normalized, color = 'b')
plt.hist(img.flatten(),256,[0,256], color = 'r')
plt.xlim([0,256])
plt.legend(('cdf','histogram'), loc = 'upper left')
plt.show()

# Equalise each channel separately instead of calling cv.equalizeHist(img)
# on the whole colour image. OpenCV stores colour images in B, G, R order,
# so the split yields blue, green, red.
blue, green, red = cv.split(img)

equ_blue = cv.equalizeHist(blue)
equ_green = cv.equalizeHist(green)
equ_red = cv.equalizeHist(red)

# Merge in the same B, G, R order so cv.imshow displays the colours correctly.
equ = cv.merge((equ_blue, equ_green, equ_red))
cv.imshow('equ.png',equ)
cv.waitKey(0)
cv.destroyAllWindows()

cv2利用sobel算子检测边缘

import cv2
import numpy as np
# Edge detection with the Sobel operator.
# NOTE(review): flag 0 presumably selects grayscale loading — confirm against
# the cv2 ImreadModes documentation.
img = cv2.imread("D:/gui.jpg", 0)
# Two Sobel passes with 16-bit signed output; the last two arguments
# presumably select the derivative order in x and y (1, 0 -> x; 0, 1 -> y).
x = cv2.Sobel(img, cv2.CV_16S, 1, 0)
y = cv2.Sobel(img, cv2.CV_16S, 0, 1)
absX = cv2.convertScaleAbs(x) # convert back to uint8
absY = cv2.convertScaleAbs(y)
# Blend the two results with equal weights (0.5 each, offset 0).
dst = cv2.addWeighted(absX, 0.5, absY, 0.5, 0)
cv2.imshow("absX", absX)
cv2.imshow("absY", absY)
cv2.imshow("Result", dst)
cv2.waitKey(0)
cv2.destroyAllWindows()

plotly

api

import pandas as pd
pd.options.plotting.backend = "plotly"

df = pd.DataFrame(dict(a=[1,3,2], b=[3,2,1]))
fig = df.plot()
fig.show()

jcm

Dch

http://chenghaoDong666.github.io/np-pd-plt/

本博客所有文章除特別声明外，均采用 CC BY 4.0 许可协议。转载请注明来源 Dch !

numpy pandas

pytorch学习

简单记录pytorch学习的一些总结

2020-11-02 python

pytorch

托福笔记

新东方托福笔记学习

2020-10-31 英语

Tofel

numpy、pandas、matplotlib学习

Numpy

axis详解

文件存储npy\npz

常用函数

Pandas

Matplotlib

plt/fig/ax的区别

可视化中间特征图

疑难解惑

1.整数还是浮点

2.cmap问题

基本函数

各种图形画法

PIL

有用的函数

cv2

基本函数

实用功能小组件

plotly

jcm

你的赏识是我前进的动力