Python主要数据读写方式

python常用的训练数据的格式读写汇总

HDF5

HDF5的特点:

  • 易于读取
  • 类似于mat数据,但数据压缩性能更强
  • 需要全部读进内存里,故HDF5文件大小不能超过内存,可以分成多个HDF5文件,将HDF5子文件路径写入txt中.
  • I/O速率不如LMDB.

小数据读写

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import h5py
import numpy as np
# create and writing HDF5 file
imgsData = np.zeros((10, 3, 224, 224))
labels = range(10)
f = h5py.File('HDF5_FILE.h5', 'w')
f['data'] = imgsData # f.create_dataset('data', data = imgsData)
f['label'] = labels # f.create)dataset('label', data = labels)
f.close()
# reading HDF5 file
f = h5py.File("HDF5_FILE.h5", "r")
for key in f.keys():
print(f[key].name)
print(f[key].shape)
print(f[key].value)
f_keys = f.keys()
imgsData = f['data'][:]
labels = f['label'][:]
f.close()

大数据读写(超过内存)

打开文件头与文件中的数据头
预留存储空间
数据指定位置赋值

- mode
r Read only, file must exist
r+ read/write, file must exit
w create file, truncate if exists
w- or x create file, fail if exists
a read/write if exists, create otherwise(defualt)

脚本在第一次保存时设置为创建模式,之后调整为追加模式。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import sys
import h5py
import numpy as np
def save_h5(times=0):
if times == 0:
h5f = h5py.File('data.h5', 'w')
dataset = h5f.create_dataset("data", (100, 1000, 1000),
maxshape=(None, 1000, 1000),
# chunks=(1, 1000, 1000),
dtype='float32')
else:
h5f = h5py.File('data.h5', 'a')
dataset = h5f['data']
# 关键:这里的h5f与dataset并不包含真正的数据,
# 只是包含了数据的相关信息,不会占据内存空间
#
# 仅当使用数组索引操作(eg. dataset[0:10])
# 或类方法.value(eg. dataset.value() or dataset.[()])时数据被读入内存中
a = np.random.rand(100, 1000, 1000).astype('float32')
# 调整数据预留存储空间(可以一次性调大些)
dataset.resize([times*100+100, 1000, 1000])
# 数据被读入内存
dataset[times*100:times*100+100] = a
# print(sys.getsizeof(h5f))
h5f.close()
def load_h5():
h5f = h5py.File('data.h5', 'r')
data = h5f['data'][0:10]
# print(data)
if __name__ == '__main__':
# save_h5(0)
for i in range(20):
save_h5(i)
# 部分数据导入内存
load_h5()

csv

xls或xlsx

最近帮女票倒腾了一些数据,涉及到excel表的读、写、修改
具体参考了以下几个链接:
第一个链接介绍了xls的读写修改方法,修改方法我没有成功运行,原因是缺了一个模块
第二个链接介绍了把csv和xlsx按行读成list,对于python处理来说比较实用
第三个链接详细介绍了OpenPyXL库的使用方法
第四个链接非常详细的介绍了用python来处理excel文件的方法

由于好久没用python处理excel,很多命令生疏了,我把工作分成两部分,
第一部分:读入excel处理数据得到结果保存为xls
第二部分:把得到的结果填到另一张表中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 9 21:06:18 2019
@author: sttomato
"""
import xlrd
import xlwt
## 数据读取,这个表有74万行,读入时间较长
workbook = xlrd.open_workbook("TMT_FigureInfo.xlsx")
table = workbook.sheets()[0]
nrows = table.nrows
dataset = []
for row in range(nrows):
dataset.append(table.row_values(row))
## 数据处理
cnt = 0
total = 0
years = [str(i) for i in range(2001, 2018)] # 处理从2001到2017年的sheet
#years = ['2001']
for year in years:
#把对应的内容以字典形式保存
dic = {}
for data in dataset[3:]:
total = total + 1
comp = data[0]
date = data[1]
if comp not in dic:
dic[comp] = 0
if year not in date:
continue
oversea = data[27].split(',')
flag = int(data[24])
if '1' in oversea or '2' in oversea:
dic[comp] += 1 * flag
cnt = cnt + 1
print(comp, data[27], flag)
print("===========")
print(cnt)
print(total)
# 把对应的key-value写入新的xls
head = ['company code', 'numbers']
book = xlwt.Workbook()
sheet = book.add_sheet(year)
col = 0
for item in head:
sheet.write(0, col, item)
col += 1
for i, key in enumerate(dic.keys()):
sheet.write(i + 1, 0, key)
sheet.write(i + 1, 1, dic[key])
book.save("{}_result.xls".format(year))
dic.clear()

第二部分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 10 09:08:07 2019
@author: sttomato
"""
import xlrd
from openpyxl import load_workbook
ori_book = load_workbook('deep_cross_agent_info.xlsx')
#year = '2001'
years = [str(i) for i in range(2001, 2018)]
for year in years:
sheet_ranges = ori_book[year]
rows = sheet_ranges.rows
#####################把之前保存的表读入字典######################
root_oversea = 'overseaback_board/'
path_oversea = root_oversea + year + "_result.xls"
workbook = xlrd.open_workbook(path_oversea)
table = workbook.sheets()[0]
nrows = table.nrows
dic_oversea = {}
for row in range(1, nrows):
comp_code = table.row_values(row)[0]
board_num = int(table.row_values(row)[1])
dic_oversea[comp_code] = board_num
#####################把之前保存的表读入字典######################
root_nati_board = 'nationality_board/'
path_nati_board = root_nati_board + year + "_result.xls"
workbook = xlrd.open_workbook(path_nati_board)
table = workbook.sheets()[0]
nrows = table.nrows
dic_nati_board = {}
for row in range(1, nrows):
comp_code = table.row_values(row)[0]
board_num = int(table.row_values(row)[1])
dic_nati_board[comp_code] = board_num
#####################把之前保存的表读入字典######################
root_nati_inde = 'nationality_indepnt/'
path_nati_inde = root_nati_inde + year + "_result.xls"
workbook = xlrd.open_workbook(path_nati_inde)
table = workbook.sheets()[0]
nrows = table.nrows
dic_nati_inde = {}
for row in range(1, nrows):
comp_code = table.row_values(row)[0]
board_num = int(table.row_values(row)[1])
dic_nati_inde[comp_code] = board_num
#######根据字典的key匹配表中的某一行,再把字典value填到对应位置#####
for i, row in enumerate(rows):
num = i + 1
coord_comp = 'A{}'.format(num)
#coord_board_num = 'N{}'.format(num)
comp_code = sheet_ranges[coord_comp].value # 取值
if comp_code in dic_oversea:
sheet_ranges['N{}'.format(num)] = dic_oversea[comp_code] # 赋值
if comp_code in dic_nati_board:
sheet_ranges['G{}'.format(num)] = dic_nati_board[comp_code]
if comp_code in dic_nati_inde:
sheet_ranges['K{}'.format(num)] = dic_nati_inde[comp_code]
print(num, comp_code)
ori_book.save('temp.xlsx')

numpy库

有缺失值就扑街

1
2
3
4
5
6
7
import numpy
# read
my_matrix = numpy.loadtxt(open("1.csv","rb"),delimiter=",",skiprows=0)
# write
numpy.savetxt('2.csv', my_matrix, delimiter = ',')

pandas库

1
2
3
4
5
6
7
8
9
10
11
import pandas as pd
# read
df = pd.read_csv('D:\Python\\l\B_train1.csv')
df.values
df.as_matrix(columns=None)
# write
testB = test[test.intersection_id.isin(["B"])]
pd.DataFrame.to_csv(testB,"D:\Python\\k\\t5B.csv")
#testB是个dataFrame

sklearn库中的Imputer

1
2
3
4
5
6
>>> m = df.as_matrix(columns=None)
>>> from sklearn.preprocessing import Imputer
>>> imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
>>> imp.fit(m)
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
>>> imp.transform(m)

mat

1
2
3
4
5
6
7
8
9
10
11
12
import scipy.io as scio
# load
dataFile = 'data.mat'
data = scio.loadmat(dataFile)
print(type(data)) # <type 'dict'>
print(type(data['A'])) # <type 'numpy.ndarray'>
# save
dataNew = 'dataNew.mat'
scio.savemat(dataNew, {'A': data['A']})

pkl

txt

LMDB

LMDB格式的优点:

  • 基于文件映射IO(memory-mapped),数据速率更好
  • 对大规模数据集更有效.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import numpy as np
import lmdb
import caffe
####################### create and writing ##############################
lmdb_file = '/path/to/data_lmdb'
N = 1000
# 准备 data 和 labels
X = np.zeros((N, 3, 224, 224), dtype=np.uint8) # data
y = np.zeros(N, dtype=np.int64) # labels
env = lmdb.open(lmdb_file, map_size=int(1e12))
txn = env.begin(write=True)
for i in range(N):
datum = caffe.proto.caffe_pb2.Datum()
datum.channels = X.shape[1]
datum.height = X.shape[2]
datum.width = X.shape[3]
datum.data = X[i].tobytes() # or .tostring() if numpy < 1.9
datum.label = int(y[i])
# 以上五行也可以直接: datum = caffe.io.array_to_datum(data, label)
str_id = '{:08}'.format(i)
txn.put(str_id, datum.SerializeToString())
# in Python3
# txn.put(str_id.encode('ascii'), datum.SerializeToString())
####################### reading ##############################
env = lmdb.open('data_lmdb', readonly=True)
txn = env.begin()
lmdb_cursor = txn.cursor()
datum = caffe.proto.caffe_pb2.Datum()
for key, value in lmdb_cursor:
print '{},{}'.format(key, value)
datum.ParseFromString(value)
flat_data = np.fromstring(datum.data, dtype=np.uint8)
data = flat_data.reshape(datum.channels, datum.height, datum.width)
# 或 data = caffe.io.datum_to_array(datum)
labels = datum.label

tfrecord