A = np.array([[1,2],[3,4],[5,6]])
B = np.array([[7,8],[9,10]])
np.dot(A, B)
array([[ 25, 28],
[ 57, 64],
[ 89, 100]])
Or,
np.matmul(A,B)
array([[ 25, 28],
[ 57, 64],
[ 89, 100]])
Also,
A@B
array([[ 25, 28],
[ 57, 64],
[ 89, 100]])
The transpose operator .T
can be combined with @ in a single expression:
A@B.T # .T is applied to B first, then the matrix product is computed
array([[ 23, 29],
[ 53, 67],
[ 83, 105]])
The numpy.outer()
function computes the outer product of two 1d-arrays, two row vectors, or two column vectors:
a = np.array([1,2,3,4])
b = np.array([10,20,30,40])
print(np.outer(a,b))
[[ 10 20 30 40]
[ 20 40 60 80]
[ 30 60 90 120]
[ 40 80 120 160]]
np.outer(a.reshape(1,4),b.reshape(1,4))==np.outer(a.reshape(4,1),b.reshape(4,1))
array([[ True, True, True, True],
[ True, True, True, True],
[ True, True, True, True],
[ True, True, True, True]])
The numpy.inner()
function computes the inner product of two 1d-arrays or two row vectors:
# when a and b are both 1-D arrays
print(np.inner(a,b))
# when a and b are both 2-D arrays holding row vectors
print(np.inner(a.reshape(1,4),b.reshape(1,4)))
300
[[300]]
numpy.dot
and numpy.matmul
also return the inner product when both inputs are 1d-arrays:
np.dot(a,b), np.matmul(a,b), a@b
(300, 300, 300)
numpy.vdot()
flattens 2d-arrays provided as input into 1d-arrays.
a_mat = np.array([[1,2],[3,4]])
b_mat = np.array([[10,20],[30,40]])
np.vdot(a_mat,b_mat)
300
numpy.random.rand()
(used frequently later on) creates an array of the given shape and populates it with random samples from a uniform distribution over [0, 1):
np.random.rand(3)
array([0.74744715, 0.81168166, 0.78421755])
Or a matrix of random uniform samples:
np.random.rand(2,2)
array([[0.49294625, 0.36978829],
[0.64566554, 0.53672976]])
a = np.array([1,2])
b = a
print(id(a)==id(b))
True
Modifying b
will get reflected in a
b.shape = 2,1
a,b
(array([[1],
[2]]),
array([[1],
[2]]))
However, redefining b
does not affect a
:
a = np.array([1,2])
b = a
b = b+1
a,b
(array([1, 2]), array([2, 3]))
hstack()
and vstack()
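functions can append arrays horizontally and vertically.
# Generate the integers 1 to 12 and arrange them as a 3-by-4 matrix
A = np.arange(1,13).reshape(3,4)
# Generate the integers 13 to 18 and arrange them as a 3-by-2 matrix
B = np.arange(13,19).reshape(3,2)
# Generate the integers 20 to 27 and arrange them as a 2-by-4 matrix
C = np.arange(20,28).reshape(2,4)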
(A,B,C)
(array([[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12]]),
array([[13, 14],
[15, 16],
[17, 18]]),
array([[20, 21, 22, 23],
[24, 25, 26, 27]]))
np.hstack((A,B,B)) # stack A, B, B side by side; all must have the same number of rows
array([[ 1, 2, 3, 4, 13, 14, 13, 14],
[ 5, 6, 7, 8, 15, 16, 15, 16],
[ 9, 10, 11, 12, 17, 18, 17, 18]])
np.vstack((A,C,C)) # stack A, C, C on top of each other; all must have the same number of columns
array([[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12],
[20, 21, 22, 23],
[24, 25, 26, 27],
[20, 21, 22, 23],
[24, 25, 26, 27]])
hstack
and vstack
can be reproduced using the numpy.concatenate((a1, a2, ...), axis=0)
function. The axis
argument sets the axis along which the arrays will be stacked:
axis=1
for the horizontal axis, and axis=0
for the vertical axis.
np.concatenate((A,B,B), axis=1)
array([[ 1, 2, 3, 4, 13, 14, 13, 14],
[ 5, 6, 7, 8, 15, 16, 15, 16],
[ 9, 10, 11, 12, 17, 18, 17, 18]])
To repeat the same array, the numpy.tile()
function can be a convenient option.
D = np.arange(1,7).reshape(2,3)
np.tile(D,(3,1)), np.tile(D,(1,3)), np.tile(D,(3,2))
(array([[1, 2, 3],
[4, 5, 6],
[1, 2, 3],
[4, 5, 6],
[1, 2, 3],
[4, 5, 6]]),
array([[1, 2, 3, 1, 2, 3, 1, 2, 3],
[4, 5, 6, 4, 5, 6, 4, 5, 6]]),
array([[1, 2, 3, 1, 2, 3],
[4, 5, 6, 4, 5, 6],
[1, 2, 3, 1, 2, 3],
[4, 5, 6, 4, 5, 6],
[1, 2, 3, 1, 2, 3],
[4, 5, 6, 4, 5, 6]]))
To repeat the individual elements of an array instead, the numpy.repeat()
function can be a convenient option.
D = np.arange(1,7).reshape(2,3)
np.repeat(D,3,axis=1), np.repeat(D,3,axis=0), np.repeat(D,3)
(array([[1, 1, 1, 2, 2, 2, 3, 3, 3],
[4, 4, 4, 5, 5, 5, 6, 6, 6]]),
array([[1, 2, 3],
[1, 2, 3],
[1, 2, 3],
[4, 5, 6],
[4, 5, 6],
[4, 5, 6]]),
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]))
Generate the data:
# Fix the random seed so that the generated numbers are reproducible
np.random.seed(1212)
# Draw a 4-by-3 matrix of samples from the uniform distribution on [0,1)
x = np.random.rand(4,3)
We compute the following statistics
mean(x)
returns the sample mean
std(x)
returns the standard deviation
sum(x)
returns the summation
amin(x)
and amax(x)
returns the minimum value and maximum value
ptp(x)
returns the range (maxima minus minima of `x`)
percentile(x,q)
returns the q-th percentile
print("Mean=%f \nStd. Dev.=%f \nSum=%f \nMin=%f \nMax=%f \nRange=%f \nMedian=%f"
% (np.mean(x), np.std(x), np.sum(x),
np.amin(x), np.amax(x),
np.ptp(x), np.percentile(x,50)))
Mean=0.512636
Std. Dev.=0.311660
Sum=6.151627
Min=0.085679
Max=0.969521
Range=0.883842
Median=0.503443
# for-loop
def forloop(n):
    """Return the sum of i*i for i in range(n), accumulated in a plain loop.

    Each index is converted to float before it is squared. The accumulator
    starts at the integer 0, so that is what is returned when the loop body
    never runs.
    """
    total = 0  # running sum, starts at 0
    for idx in range(n):  # visit every index below n
        square = float(idx) * float(idx)
        total = total + square  # fold the new square into the running sum
    return total
def listcomp(n):
    """Return the sum of k*k for k in range(n), built with a list comprehension."""
    squares = [float(k) * k for k in range(n)]
    return sum(squares)
def numpymethod(n):
    """Return the sum of squares of 0, 1, ..., n-1 using vectorised NumPy calls."""
    values = np.arange(0, n, dtype='d')  # 'd' is the double-precision float code
    return np.square(values).sum()
It is important to always check the functions before implementing the main program. In this case,
res1 = forloop(1000)
res2 = listcomp(1000)
res3 = numpymethod(1000)
res1, res2, res3
(332833500.0, 332833500.0, 332833500.0)
In order to track the speed of every method, we can incorporate a timer using the built-in time
module.
import time # 引入 time 模块
def timer(f, args):
    """Return the number of seconds taken by a single call of ``f(*args)``.

    Parameters
    ----------
    f : callable
        The function to be timed.
    args : tuple
        Positional arguments unpacked into the call, e.g. ``(n,)``.

    Returns
    -------
    float
        Elapsed time in seconds. The return value of ``f`` is discarded.
    """
    # perf_counter is monotonic and uses the highest-resolution clock
    # available, so short runs are not distorted by system clock adjustments.
    starttime = time.perf_counter()
    f(*args)  # the tuple is unpacked into positional arguments
    return time.perf_counter() - starttime
The main program starts from here:
n = 1000000
forloop_time = timer(forloop,(n,))
listcomp_time = timer(listcomp,(n,))
numpy_time = timer(numpymethod,(n,))
print("n is set to be %d" % n)
print("for-loop takes %6.5f seconds." % forloop_time)
print("List comprehension takes %6.5f seconds." % listcomp_time)
print("NumPy takes %6.5f seconds." % numpy_time)
n is set to be 1000000
for-loop takes 0.22419 seconds.
List comprehension takes 0.18575 seconds.
NumPy takes 0.01047 seconds.
n = 500
K = 4
np.random.seed(12345)
X = np.random.rand(n,K)
c = np.arange(K)+10
def doublefor():
    """Return the 1-by-K array of double sums computed with explicit loops.

    For every column k, accumulates (X[i,k]-c[k])*(X[j,k]-c[k]) over all
    pairs (i, j) with i and j in range(n). Reads the module-level names
    n, K, X and c.
    """
    total = np.zeros([1, K])
    for row_a in range(n):
        for row_b in range(n):
            for col in range(K):
                left = X[row_a, col] - c[col]
                right = X[row_b, col] - c[col]
                total[:, col] = total[:, col] + left * right
    return total
starttime = time.time()
[doublefor() for i in range(10)]
endtime = time.time()-starttime
print("for-loop takes %6.5f seconds:" % endtime)
print("double sum is",doublefor() )
for-loop takes 25.01792 seconds:
double sum is [[22519883.25492541 27621066.13247766 33021514.42390825 39079057.04898941]]
The code presented below runs much faster than the doublefor
function:
def matcomp3d():
    """Return the length-K array of double sums computed with matrix products.

    For each column k the centred column (X[:,k]-c[k]) is stored as an
    n-by-1 slice, multiplied by its own transpose to give an n-by-n slice,
    and that slice is summed. Reads the module-level names n, K, X and c.
    """
    centred = np.zeros([n, 1, K])
    products = np.zeros([n, n, K])
    totals = np.zeros([K])
    for col in range(K):
        centred[:, :, col] = (X[:, col] - c[col]).reshape(n, 1)
        column = centred[:, :, col]  # n-by-1 view of the slice just written
        products[:, :, col] = np.dot(column, column.T)
        totals[col] = np.sum(products[:, :, col])
    return totals
# Time ten consecutive calls of matcomp3d.
starttime = time.time()
for _ in range(10):  # results are discarded; only the elapsed time matters
    matcomp3d()
endtime = time.time()-starttime  # elapsed seconds, despite the name
print("matrix computation takes %6.5f seconds:" % endtime)
# Print matcomp3d's own result so it can be compared with doublefor's.
print("double sum is",matcomp3d() )
matrix computation takes 0.05578 seconds:
double sum is [[22519883.25492541 27621066.13247766 33021514.42390825 39079057.04898941]]
pandas offers two core data structures:
Series
and DataFrame
.
Series
is made up of an index and its corresponding data values.
DataFrame
encapsulates a Series that extends to two dimensions.
Series
and DataFrame
import pandas as pd
contents = np.array([1,5,np.nan,6])
# Panda Series
pd.Series(contents)
# Panda DataFrame
pd.DataFrame(contents)
We can define names or labels for both rows and columns in a DataFrame
:
df = pd.DataFrame([1,5,np.nan,6],
index=['row1','row2','row3','row4'],
columns=['col1'])
df
Please note that
DataFrame
.To view the labels:
df.index, df.columns
(Index(['row1', 'row2', 'row3', 'row4'], dtype='object'),
Index(['col1'], dtype='object'))
DataFrame
DataFrame can be transposed using .T
method as if operating a matrix.
df.T # 转置
Its summary statistic can be readily displayed using describe
method:
df.describe().T
A different method of defining a DataFrame
involves using dictionaries:
data = {'Name':['Tom','Mary','John','Bill'],
'Age':[20,21,19,18]}
pd.DataFrame(data)
One can select part of the dictionary to construct a DataFrame
:
data = {'Name':['Tom','Mary','John','Bill'],
'Class':['I','I','II','II'],
'Age':[20,21,19,18],
'GPA':[4.1,3.2,4.0,3.8]}
pd.DataFrame(data, columns=['Name','GPA'],
index=['001','002','003','004'])
Alternatively, it is possible to define an empty DataFrame
and then fill in the columns:
import random
# 生成一个空DataFrame
df = pd.DataFrame()
# 添加每一列的内容
df['ID'] = np.arange(1,5,1)
df['Random'] = random.sample(range(1,1000),4)
df['Gender'] = ['Male','Female','Male','Male']
df['Male'] = [1,0,1,1]
df
To create a DataFrame
in which observations are indexed by time periods, we can start with generating a date/time object (using pandas.date_range
function):
dates = pd.date_range('20220401', periods=3, freq="W")
dates
DatetimeIndex(['2022-04-03', '2022-04-10', '2022-04-17'], dtype='datetime64[ns]', freq='W-SUN')
Then we pass this date object as the argument for an index of the DataFrame
.
df = pd.DataFrame(np.random.randn(3,4), index=dates,
columns=list('ABCD'))
When working with DataFrames
, they can often become large. Using the head(#)
and tail(#)
functions, you can easily browse the top and bottom rows of the DataFrame
.
df.head(2)
Also, we can apply the .loc
method as follows. Let us see the example:
df.loc['20220403':'20220410',['A', 'B']]
The integer-position-based counterpart of .loc
is called .iloc
.
df.iloc[[0,1],[0,1]]
The .at
method works only with column and row labels.
Entries of a DataFrame
can be selected according to logical conditions.
df[df >= 0]
In the below code:
DataFrame
df
is passed to melt()
function.
id_vars
is the variable that needs to be left unaltered, which is Name
here.
var_name
are the column names.
value_name
are its values.
df2 = pd.melt(df,id_vars=['Name'], var_name='Variables',
value_name='Values')
The long-format table can be reshaped back with pivot()
:
df2.pivot(index='Name', columns='Variables', values='Values')
csv
format).xlsx
format).Now we use the pandas.read_csv()
function to import the CSV formatted dataset.
data_pd = pd.read_csv("dependency/data_wageedu.csv")
# 显示数据的前5行
data_pd.head(5)
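When the same file is read with numpy.genfromtxt(), the header row holding the labels
logwage
and edu
cannot be converted to numbers, and is regarded as invalid numerical values (it is read as nan).
The delimiter
option is to set the string used to separate values in the text file.
For a CSV file, use delimiter=","
.
data_np = np.genfromtxt("dependency/data_wageedu.csv",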
delimiter=",")
# 显示数据的前5行
data_np[:5,:]
array([[ nan, nan],
[ 2.868216, 12. ],
[ 2.358269, 7. ],
[ 2.732919, 12. ],
[ 2.416616, 10. ]])
To import an Excel workbook, the pandas.read_excel()
function can help. The sheet_name
option allows users to specify the particular sheet to import.
data2_pd = pd.read_excel("dependency/data_lifesat.xls",
sheet_name="Sheet1")
# 显示数据的前5行
data2_pd.head(5)
To save a DataFrame
as a CSV file, use the
pandas.DataFrame.to_csv()
function. Excel files can be written analogously with pandas.DataFrame.to_excel()
.
# 将NumPy数组转换为DataFrame
df = pd.DataFrame(data_np)
# 将DataFrame存为CSV文件
df.to_csv("data_df.csv")
A NumPy array can be written to a text file with the numpy.savetxt()
function. Here the output file is data_np.csv
.
np.savetxt("data_np.csv", data_np,
delimiter = ",")
DataFrames
.DataFrame
variable.
# Configure the Python-Stata interface
import os
os.chdir('/Applications/Stata/utilities')
from pystata import config
config.init('mp')
# Load the auto.dta dataset in Stata
from pystata import stata
stata.run('''sysuse auto, clear''')
# Transfer the current Stata dataset to Python
auto_pd = stata.pdataframe_from_data()
auto_pd.head()
(1978 automobile data)
Conversely, pandas DataFrames
can be sent to Stata. The code below loads the DataFrame
auto_pd
into Stata.
stata.pdataframe_to_data(auto_pd, force=True)
stata.run('list in 1/2')
+------------------------------------------------------------------------+
1. | make | price | mpg | rep78 | headroom | trunk | weight | length |
| AMC Concord | 4099 | 22 | 3 | 2.5 | 11 | 2930 | 186 |
|------------------------------------------------------------------------|
| turn | displa~t | gear_ra~o | foreign |
| 40 | 121 | 3.5799999 | 0 |
+------------------------------------------------------------------------+
+------------------------------------------------------------------------+
2. | make | price | mpg | rep78 | headroom | trunk | weight | length |
| AMC Pacer | 4749 | 17 | 3 | 3 | 11 | 3350 | 173 |
|------------------------------------------------------------------------|
| turn | displa~t | gear_ra~o | foreign |
| 40 | 258 | 2.53 | 0 |
+------------------------------------------------------------------------+