Python - Numpy

2018-11-25 本文已影响16人音符纸飞机

NumPy以N维数组作为操作对象进行各种科学计算
也包含了线性代数、傅里叶变换、随机数等

术语

数组的每个维度都称为axis
数组的维度个数（axis的数量）称为rank
由每个维度的长度组成的元组称为shape
数组中元素的个数称为size

创建数组

# Conventional alias: every snippet below calls NumPy through the `np` prefix.
import numpy as np

# A single integer gives a 1-D array filled with zeros.
np.zeros(5)
# array([0., 0., 0., 0., 0.])

# A tuple gives a multi-dimensional array: here 3 rows by 4 columns.
a = np.zeros((3, 4))
a
# array([[0., 0., 0., 0.],
#        [0., 0., 0., 0.],
#        [0., 0., 0., 0.]])
a.shape  # length of each axis
# (3, 4)
a.size  # total number of elements
# 12
a.ndim  # number of axes (the rank)
# 2
type(np.zeros((3, 4)))
# numpy.ndarray

# Array of the given shape filled with ones.
np.ones(shape=(3, 4))
# array([[ 1.,  1.,  1.,  1.],
#        [ 1.,  1.,  1.,  1.],
#        [ 1.,  1.,  1.,  1.]])

# Array of the given shape filled with an arbitrary constant.
np.full(shape=(3, 4), fill_value=np.pi)
# array([[ 3.14159265,  3.14159265,  3.14159265,  3.14159265],
#        [ 3.14159265,  3.14159265,  3.14159265,  3.14159265],
#        [ 3.14159265,  3.14159265,  3.14159265,  3.14159265]])

# The array is NOT initialized: its contents are whatever happened to be in
# memory at that moment, so the values are unpredictable.
np.empty(shape=(2, 3))

# Build an array from nested Python lists.
np.array(
    [
        [1, 2, 3, 4],
        [10, 20, 30, 40],
    ]
)
# array([[ 1,  2,  3,  4],
#        [10, 20, 30, 40]])

# Like the built-in range(): start inclusive, stop exclusive.
np.arange(1, 5)
# array([1, 2, 3, 4])

# The third argument (0.5) is the step.
np.arange(1, 5, 0.5)
# array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5])

# Here 6 is the number of elements. Prefer linspace over arange when working
# with float ranges.
print(np.linspace(0, 5/3, num=6))
# [0.         0.33333333 0.66666667 1.         1.33333333 1.66666667]
# Samples drawn from a uniform distribution over the half-open interval [0, 1).
np.random.rand(3, 4)
# array([[0.33300496, 0.21927098, 0.63621247, 0.73141594],
#        [0.25373228, 0.88635854, 0.10527521, 0.34474523],
#        [0.5792484 , 0.96086829, 0.4981987 , 0.04161142]])

# Samples drawn from the standard normal distribution (mean 0, variance 1).
np.random.randn(3, 4)
# array([[-0.46947439,  0.54256004, -0.46341769, -0.46572975],
#        [ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
#        [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ]])

直观感受一下rand和randn的区别

# IPython/Jupyter magic that renders figures inline in the notebook; it is not
# valid syntax in a plain .py script.
%matplotlib inline
import matplotlib.pyplot as plt
# density=True normalizes each histogram so that its total area is 1: every bar
# height becomes count / (total count * bin width), i.e. a probability density.
# Draw 100000 samples from each generator and overlay the two outlines.
plt.hist(np.random.rand(100000), density=True, bins=100, histtype="step", color="blue", label="rand")
plt.hist(np.random.randn(100000), density=True, bins=100, histtype="step", color="red", label="randn")
# Axis limits as [xmin, xmax, ymin, ymax].
plt.axis([-2.5, 2.5, 0, 1.1])
plt.legend(loc = "upper left")
plt.title("Random distributions")
plt.xlabel("Value")
plt.ylabel("Density")
plt.show()

rand,randn随机数分布

def my_function(z, y, x):
    """Return ``x * y + z``.

    When used with ``np.fromfunction`` the three arguments are index grids
    (one per axis), so the expression is evaluated element-wise.
    """
    scaled = x * y
    return scaled + z


# fromfunction calls my_function once with coordinate arrays of shape (3, 2, 10).
np.fromfunction(function=my_function, shape=(3, 2, 10))

# array([[[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
#         [  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.]],
#
#        [[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
#         [  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.]],
#
#        [[  2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.],
#         [  2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.]]])

数组元素

# Every element of an ndarray has the same type (dtype).
c = np.arange(1, 5)
print(c.dtype)
# int32
# NOTE: the default integer dtype is platform-dependent; int32 is what the
# original run printed, many 64-bit platforms print int64 instead.

c = np.arange(1.0, 5.0)
print(c.dtype)
# float64

# The dtype can also be requested explicitly.
d = np.arange(1, 5, dtype=np.complex64)
print(d.dtype, d)
# complex64 [1.+0.j 2.+0.j 3.+0.j 4.+0.j]
# Available dtypes include:
#  int8, int16, int32, int64, uint8|16|32|64, float16|32|64, complex64|128
# https://docs.scipy.org/doc/numpy-1.10.1/user/basics.types.html

e = np.arange(1, 5, dtype=np.complex64)

# itemsize is the number of bytes occupied by one element.
e.itemsize
# 8

# data is a buffer object pointing at the start of the array's memory.
e.data
# <memory at 0x000001D9D01D5B88>

Reshape

g = np.arange(6)

# Assigning to .shape changes the shape of the array in place.
g.shape = (2, 3)
print(g)
# [[0 1 2]
#  [3 4 5]]

# reshape returns a new array object that points at the same underlying data
# (the same buffer that ndarray.data exposes).
g2 = g.reshape((3, 2))
print(g2)
# [[0 1]
#  [2 3]
#  [4 5]]

# The original array keeps its own shape.
print(g)
# [[0 1 2]
#  [3 4 5]]

# Writing through one of the two arrays is visible through the other.
g2[1, 1] = 999
g2
# array([[  0,   1],
#        [  2, 999],
#        [  4,   5]])
g
# array([[  0,   1,   2],
#        [999,   4,   5]])

# ravel returns a flattened 1-D array that also shares the original memory.
g.ravel()
# array([  0,   1,   2, 999,   4,   5])

代数运算

# Arithmetic operators are applied element by element.
a = np.array([14, 23, 32, 41])
b = np.array([5, 4, 3, 2])

a + b   # sum              -> array([19, 27, 35, 43])
a - b   # difference       -> array([ 9, 19, 29, 39])
a * b   # product          -> array([70, 92, 96, 82])
a / b   # true division    -> array([ 2.8, 5.75, 10.66666667, 20.5])
a // b  # integer quotient -> array([ 2,  5, 10, 20])
a % b   # remainder        -> array([4, 3, 2, 1])
a ** b  # power            -> array([537824, 279841,  32768,   1681])

Broadcasting

一般来说，当NumPy期望相同形状的数组，但发现情况并非如此时，它应用所谓的广播规则
为了定义两个形状是否是可兼容的，Numpy从最后开始往前逐个比较它们的维度（dimensions）大小。比较过程中，如果两者的对应维度相同，或者其中之一（或者全是）等于1，比较继续进行直到最前面的维度。否则报错。当其中之一的形状的维度超出范围（维度的下标），Numpy将会使用1进行比较直到另一个也超出dim范围。一旦Numpy确定两者的形状是可兼容的，最终结果的形状就成了每个维度上取两者之间最大的形状尺寸。

k = np.arange(6).reshape(2, 3)
k
# array([[0, 1, 2],
#        [3, 4, 5]])

# A plain list is converted to an array before the addition.
h = [100, 200, 300]
k + h
# array([[100, 201, 302],
#        [103, 204, 305]])

# How broadcasting lines the shapes up:
#   k has shape (2, 3)
#   h has shape (3,)
#   h -> (1, 3) -> (2, 3) -> [[100, 200, 300], [100, 200, 300]]
# so h is added to every row of k.

条件运算

m = np.array([20, -5, 30, 40])

# Comparison against a same-length sequence is done element by element.
m < [15, 16, 35, 36]
# array([False,  True,  True, False])

# Comparison against a scalar broadcasts the scalar to every element.
m < 25
# array([ True,  True, False, False])

# A boolean array used as an index keeps only the elements marked True.
m[m < 25]
# array([20, -5])

数理统计

a = np.array(
    [
        [-2.5, 3.1, 7],
        [10, 11, 12],
    ]
)

# Without an axis argument each reduction runs over all elements.
a.mean()  # arithmetic mean
a.min()   # smallest element
a.max()   # largest element
a.prod()  # product of all elements
a.std()   # standard deviation
a.var()   # variance
# Each of these also accepts an axis argument to reduce along that axis only.

通用函数 Universal Functions

a = np.array(
    [
        [-2.5, 3.1, 7],
        [10, 11, 12],
    ]
)

# A universal function is applied to every element independently.
np.square(a)
# array([[   6.25,    9.61,   49.  ],
#        [ 100.  ,  121.  ,  144.  ]])

# Other universal functions:
#   np.abs, np.sqrt, np.exp, np.log,
#   np.sign   - sign of each element (positive / negative)
#   np.ceil
#   np.modf   - returns two ndarrays: the fractional parts and the integral parts
#   np.isnan, np.cos

数组下标

# A slice of an array is a view onto the original array.
a = np.array([1, 5, 3, 19, 13, 7, 3])
# Only an explicit copy() allocates a new block of memory.
another_slice = a[2:6].copy()

b = np.arange(48).reshape(4, 12)

# Fancy indexing: the result is NOT a view.
b[(0, 2), 2:5]      # rows 0 and 2, columns 2 to 4
b[:, (-1, 2, -1)]   # all rows; last column, column 2, last column again

# Boolean indexing
b = np.arange(48).reshape(4, 12)
rows_on = np.array([True, False, True, False])
b[rows_on, :]  # Rows 0 and 2, all columns. Equivalent to b[(0, 2), :]
cols_on = np.array([False, True, False] * 4)
b[:, cols_on]  # All rows, columns 1, 4, 7 and 10

# TODO: np.ix_

迭代Iterating

c = np.arange(6).reshape(2, 3)

# flat is a 1-D iterator over every element, in the same order ravel() yields.
for item in c.flat:
    print("Item:", item)
# Item: 0
# Item: 1
# Item: 2
# Item: 3
# Item: 4
# Item: 5

数组叠加

垂直叠加 np.vstack((a, b))（注意：待叠加的数组要放在一个元组或列表中作为单个参数传入）
水平叠加 np.hstack((a, b))
在新的axis上叠加 np.stack((a, b)) 必须保证所有数组的shape是一致的

拆分数组

垂直切分 np.vsplit(r, 3) 垂直切分为三块
水平切分 np.hsplit(r, 2) 水平切分为两块

转置

t = np.arange(24).reshape(4, 2, 3)

# The axes 0, 1, 2 (depth, height, width) are re-ordered to 1, 2, 0
# (depth -> width, height -> depth, width -> height).
t1 = t.transpose(1, 2, 0)

# With no arguments transpose reverses the axis order; .T is shorthand for it.
# Both are equivalent to t.transpose((2, 1, 0)).
t2 = t.transpose()
t2 = t.T

# Swap two axes only; equivalent to t.transpose((1, 0, 2)).
t3 = np.swapaxes(t, 0, 1)

m2 = np.arange(5)
# array([0, 1, 2, 3, 4])

# Transposing a 1-D array changes nothing.
m2.T
# array([0, 1, 2, 3, 4])

# To transpose a 1-D array, first turn it into a single-row 2-D array.
m2r = m2.reshape((1, 5))
# array([[0, 1, 2, 3, 4]])
m2r.T
# array([[0],
#        [1],
#        [2],
#        [3],
#        [4]])

线性代数

######点乘

# Matrix product of a (2, 5) array with a (5, 3) array gives a (2, 3) array.
n1 = np.arange(10).reshape((2, 5))
n2 = np.arange(15).reshape((5, 3))

n1.dot(n2)  # method form
n1 @ n2     # newer infix operator form of the same product

######逆

import numpy.linalg as linalg

# --- Inverse ---
m3 = np.array(
    [
        [1, 2, 3],
        [5, 7, 11],
        [21, 29, 31],
    ]
)
linalg.inv(m3)
# Calculate the generalized inverse of a matrix using its singular-value
# decomposition (SVD) and including all *large* singular values.
linalg.pinv(m3)

# --- Identity matrix ---
np.eye(3)

# --- QR decomposition ---
q, r = linalg.qr(m3)

# --- Eigenvalues and eigenvectors ---
# They satisfy m3.v - λ*v = 0
eigenvalues, eigenvectors = linalg.eig(m3)

# --- Singular value decomposition ---
# U.Σ.V == m4, where Σ is built from the singular values in S_diag and the
# third returned value is used as-is (no further transpose).
m4 = np.array(
    [
        [1, 0, 0, 0, 2],
        [0, 0, 3, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 2, 0, 0, 0],
    ]
)
U, S_diag, V = linalg.svd(m4)

# --- Diagonal: the elements on the main diagonal ---
np.diag(m3)
# --- Trace: the sum of the elements on the main diagonal ---
np.trace(m3)

# --- Solve a system of linear scalar equations ---
#   2x + 6y = 6
#   5x + 3y = -9

# One row per equation: left-hand coefficients, then the right-hand sides.
coeffs = np.array(
    [
        [2, 6],
        [5, 3],
    ]
)
depvars = np.array([6, -9])
solution = linalg.solve(coeffs, depvars)
solution
# array([-3.,  2.])

向量化

坐标矩阵

# Counter-example: fill a 768 x 1024 array one element at a time in pure Python.
import math
data = np.empty((768, 1024))
for y in range(768):
    for x in range(1024):
        data[y, x] = math.sin(x*y/40.5)  # BAD! Very inefficient.

# Vectorized version: build coordinate matrices once and evaluate the whole
# expression with array operations instead of a Python loop.
x_coords = np.arange(0, 1024)  # [0, 1, 2, ..., 1023]
y_coords = np.arange(0, 768)   # [0, 1, 2, ..., 767]
# X holds the x coordinate of every grid point and Y the y coordinate;
# both have shape (768, 1024), matching `data` above.
X, Y = np.meshgrid(x_coords, y_coords)
data = np.sin(X*Y/40.5)

数组的保存和加载

# Binary format: the file is loaded back below under the name "my_array.npy".
a = np.random.rand(2,3)
np.save("my_array", a)
a_loaded = np.load("my_array.npy")


###text format
# Comma-separated text, readable by other tools.
np.savetxt("my_array.csv", a, delimiter=",")
a_loaded = np.loadtxt("my_array.csv", delimiter=",")

###zipped format (.npz)
# Several arrays stored in one archive, each under its keyword name.
# NOTE(review): `b` is not defined in this snippet; presumably it is the array
# created in an earlier section - confirm before running this in isolation.
np.savez("my_arrays", my_a=a, my_b=b)
my_arrays = np.load("my_arrays.npz")
# The loaded object is indexed by the keyword names used in savez.
my_arrays.keys()
my_arrays["my_a"]