github
其实如果不使用可视化工具来解析parquet文件,很难看出parquet文件内部应该是什么样子。借助可视化工具可以发现,parquet文件会像表格(如excel文件、csv文件)那样按行列排列数据。通过结构体写入的时候也和写入csv文件很像:每写一次就是写入一行。
go get github.com/xitongsys/parquet-go
// Student is the row type used by the parquet example: one value of this
// struct is written per row. Each `parquet` tag names the column and states
// its parquet type (plus optional converted type / encoding).
type Student struct {
	// Name is stored as a UTF8 byte array with dictionary encoding.
	Name string `parquet:"name=name, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
	// Age is stored as a plain-encoded INT32.
	Age int32 `parquet:"name=age, type=INT32, encoding=PLAIN"`
	// Id is stored as an INT64.
	Id int64 `parquet:"name=id, type=INT64"`
	// Weight is stored as a FLOAT.
	Weight float32 `parquet:"name=weight, type=FLOAT"`
	// Sex is stored as a BOOLEAN.
	Sex bool `parquet:"name=sex, type=BOOLEAN"`
	// Day is an INT32 annotated with the DATE converted type.
	Day int32 `parquet:"name=day, type=INT32, convertedtype=DATE"`

	// Ignored carries no parquet tag, so it is not written to the file.
	Ignored int32
}
下面的结构体中,如果有切片类型的字段,就要使用valuetype标签来指定切片元素的parquet类型,因为源码(见后面的NewSchemaHandlerFromStruct)会根据字段的Go类型进行判断:切片字段会走LIST分支,元素的标签信息由valuetype等标签生成。
// Acc is a row type whose columns are all float32 slices. Each field carries
// a `parquet` tag with the column name and the element type (valuetype=FLOAT),
// and a `json` tag that omits the field when it is empty.
type Acc struct {
	// Accel maps to the "accel" column / JSON key.
	Accel []float32 `parquet:"name=accel, valuetype=FLOAT" json:"accel,omitempty"`
	// Gyro maps to the "gyro" column / JSON key.
	Gyro []float32 `parquet:"name=gyro, valuetype=FLOAT" json:"gyro,omitempty"`
	// Comp maps to the "comp" column / JSON key.
	Comp []float32 `parquet:"name=comp, valuetype=FLOAT" json:"comp,omitempty"`
}
// TestParquetExp round-trips Student rows through a local parquet file: it
// writes rowCount rows to ./flat.parquet with a ParquetWriter, then reads them
// back in batches of batchSize with a ParquetReader and logs every batch.
//
// Every failure aborts the test via t.Fatalf, so a broken write or read is
// reported instead of letting the test pass silently. The file and reader are
// released on all exit paths.
func TestParquetExp(t *testing.T) {
	const (
		path      = "./flat.parquet"
		rowCount  = 100 // one Write call per row, so 100 rows end up in the file
		batchSize = 2   // the length of the destination slice is the number of rows requested per Read
	)

	w, err := os.Create(path)
	if err != nil {
		t.Fatalf("Can't create local file: %v", err)
	}
	// Safety net for the early-exit paths. After the explicit Close below this
	// second Close only returns an "already closed" error, which is ignored.
	defer w.Close()

	// write
	pw, err := writer.NewParquetWriterFromWriter(w, new(Student), 4) // 4 is the writer's parallelism
	if err != nil {
		t.Fatalf("Can't create parquet writer: %v", err)
	}
	pw.RowGroupSize = 128 * 1024 * 1024 //128M
	pw.CompressionType = parquet.CompressionCodec_SNAPPY

	// Whole days since the Unix epoch; computed once so every row gets the same value.
	day := int32(time.Now().Unix() / 3600 / 24)
	for i := 0; i < rowCount; i++ {
		stu := Student{
			Name:   "StudentName",
			Age:    int32(20 + i%5),
			Id:     int64(i),
			Weight: 50.0 + float32(i)*0.1,
			Sex:    i%2 == 0,
			Day:    day,
		}
		if err = pw.Write(stu); err != nil {
			t.Fatalf("Write error: %v", err)
		}
	}
	if err = pw.WriteStop(); err != nil {
		t.Fatalf("WriteStop error: %v", err)
	}
	// The file must be fully closed before it is reopened for reading.
	if err = w.Close(); err != nil {
		t.Fatalf("Can't close local file: %v", err)
	}
	log.Println("Write Finished")

	// read
	fr, err := local.NewLocalFileReader(path)
	if err != nil {
		t.Fatalf("Can't open file: %v", err)
	}
	defer fr.Close()

	pr, err := reader.NewParquetReader(fr, new(Student), 4)
	if err != nil {
		t.Fatalf("Can't create parquet reader: %v", err)
	}
	// Deferred after fr.Close, so the reader is stopped before the file is closed.
	defer pr.ReadStop()

	total := int(pr.GetNumRows()) // expected to equal the number of rows written
	if total != rowCount {
		t.Fatalf("GetNumRows = %d, want %d", total, rowCount)
	}

	// Advance by the number of rows actually returned rather than once per row,
	// so the loop stops when the data is exhausted instead of issuing empty reads.
	// (pr.SkipRows(n) can be used here to skip rows instead of reading them.)
	for read := 0; read < total; {
		want := batchSize
		if remaining := total - read; remaining < want {
			want = remaining
		}
		stus := make([]Student, want)
		if err = pr.Read(&stus); err != nil {
			t.Fatalf("Read error: %v", err)
		}
		if len(stus) == 0 {
			// Guard against looping forever if the reader returns no rows.
			t.Fatalf("Read returned no rows after %d of %d", read, total)
		}
		log.Println(stus)
		read += len(stus)
	}
}
// NewSchemaHandlerFromStruct builds a SchemaHandler by reflecting over obj,
// which must be a pointer (reflect.TypeOf(obj).Elem() is taken), normally a
// pointer to a struct whose fields carry `parquet:"..."` tags.
//
// The type tree is walked depth-first with an explicit stack. For every node
// one parquet.SchemaElement and one *common.Tag are appended, so the two
// result lists stay index-aligned:
//
//   - struct: one group element; its tagged fields are pushed as children and
//     fields without a parquet tag are skipped.
//   - slice whose repetition type is not REPEATED: a LIST group, a repeated
//     "List" group, then a single "Element" child.
//   - slice whose repetition type is REPEATED: no wrapper elements; the
//     element type is pushed with the same tag.
//   - map: a MAP group, a repeated "Key_value" group, then key and value.
//   - anything else: a leaf element built from the tag.
//
// Any panic raised on the way (for example obj being nil or not a pointer)
// is recovered and returned as err; in that case sh is nil.
func NewSchemaHandlerFromStruct(obj interface{}) (sh *SchemaHandler, err error) {
	defer func() {
		if r := recover(); r != nil {
			switch x := r.(type) {
			case string:
				err = errors.New(x)
			case error:
				err = x
			default:
				err = errors.New("error occurred")
			}
		}
	}()

	// Type that obj points to; this becomes the root of the schema.
	ot := reflect.TypeOf(obj).Elem()

	// Root item describing the whole row type.
	item := NewItem()
	item.GoType = ot
	item.Info.InName = "Parquet_go_root"
	item.Info.ExName = "parquet_go_root"
	item.Info.RepetitionType = parquet.FieldRepetitionType_REQUIRED

	// Work stack of nodes still to be converted; it starts with the root only.
	stack := []*Item{item}
	schemaElements := make([]*parquet.SchemaElement, 0) // resulting schema elements
	infos := make([]*common.Tag, 0)                     // tag info, one per schema element

	for len(stack) > 0 {
		// Pop the most recently pushed node.
		ln := len(stack)
		item = stack[ln-1]
		stack = stack[:ln-1]

		var newInfo *common.Tag
		switch {
		case item.GoType.Kind() == reflect.Struct:
			schema := parquet.NewSchemaElement()
			schema.Name = item.Info.InName
			schema.RepetitionType = &item.Info.RepetitionType
			fieldCount := item.GoType.NumField()
			// NumChildren points at numField, so the decrements below (for
			// untagged fields) are reflected in the schema element.
			numField := int32(fieldCount)
			schema.NumChildren = &numField
			schemaElements = append(schemaElements, schema)

			newInfo = common.NewTag()
			common.DeepCopy(item.Info, newInfo)
			infos = append(infos, newInfo)

			// Push fields in reverse so they are popped in declaration order.
			for i := fieldCount - 1; i >= 0; i-- {
				f := item.GoType.Field(i)
				tagStr := f.Tag.Get("parquet")

				// Ignore fields without a parquet tag.
				if len(tagStr) <= 0 {
					numField--
					continue
				}

				newItem := NewItem()
				// Parse the tag string (e.g. "name=comp, type=FLOAT") into a *common.Tag.
				newItem.Info, err = common.StringToTag(tagStr)
				if err != nil {
					return nil, fmt.Errorf("failed parse tag: %s", err.Error())
				}
				// The Go field name is always used as the internal name,
				// overriding whatever StringToTag put there.
				newItem.Info.InName = f.Name
				newItem.GoType = f.Type
				if f.Type.Kind() == reflect.Ptr {
					// Pointer fields become OPTIONAL columns of the pointed-to type.
					newItem.GoType = f.Type.Elem()
					newItem.Info.RepetitionType = parquet.FieldRepetitionType_OPTIONAL
				}
				stack = append(stack, newItem)
			}

		case item.GoType.Kind() == reflect.Slice &&
			item.Info.RepetitionType != parquet.FieldRepetitionType_REPEATED:
			// Outer LIST group named after the field.
			schema := parquet.NewSchemaElement()
			schema.Name = item.Info.InName
			rt1 := item.Info.RepetitionType
			schema.RepetitionType = &rt1
			var numField int32 = 1
			schema.NumChildren = &numField
			ct1 := parquet.ConvertedType_LIST
			schema.ConvertedType = &ct1
			schemaElements = append(schemaElements, schema)
			newInfo = common.NewTag()
			common.DeepCopy(item.Info, newInfo)
			infos = append(infos, newInfo)

			// Repeated "List" group holding the single element child.
			schema = parquet.NewSchemaElement()
			schema.Name = "List"
			rt2 := parquet.FieldRepetitionType_REPEATED
			schema.RepetitionType = &rt2
			schema.NumChildren = &numField
			schemaElements = append(schemaElements, schema)
			newInfo = common.NewTag()
			common.DeepCopy(item.Info, newInfo)
			newInfo.InName, newInfo.ExName = "List", "list"
			infos = append(infos, newInfo)

			// Element child, processed on a later iteration.
			// NOTE(review): the original annotation marked the GetValueTagMap
			// call as problematic without saying why - confirm it yields the
			// element tag expected for list values.
			newItem := NewItem()
			newItem.Info = common.GetValueTagMap(item.Info)
			newItem.Info.InName = "Element"
			newItem.Info.ExName = "element"
			newItem.GoType = item.GoType.Elem()
			if newItem.GoType.Kind() == reflect.Ptr {
				newItem.Info.RepetitionType = parquet.FieldRepetitionType_OPTIONAL
				newItem.GoType = item.GoType.Elem().Elem()
			} else {
				newItem.Info.RepetitionType = parquet.FieldRepetitionType_REQUIRED
			}
			stack = append(stack, newItem)

		case item.GoType.Kind() == reflect.Slice &&
			item.Info.RepetitionType == parquet.FieldRepetitionType_REPEATED:
			// Repeated slice: no wrapper elements, the element type reuses
			// the very same tag (shared pointer, not a copy).
			newItem := NewItem()
			newItem.Info = item.Info
			newItem.GoType = item.GoType.Elem()
			stack = append(stack, newItem)

		case item.GoType.Kind() == reflect.Map:
			// Outer MAP group named after the field.
			schema := parquet.NewSchemaElement()
			schema.Name = item.Info.InName
			rt1 := item.Info.RepetitionType
			schema.RepetitionType = &rt1
			var numField1 int32 = 1
			schema.NumChildren = &numField1
			ct1 := parquet.ConvertedType_MAP
			schema.ConvertedType = &ct1
			schemaElements = append(schemaElements, schema)
			newInfo = common.NewTag()
			common.DeepCopy(item.Info, newInfo)
			infos = append(infos, newInfo)

			// Repeated "Key_value" group with two children: key and value.
			schema = parquet.NewSchemaElement()
			schema.Name = "Key_value"
			rt2 := parquet.FieldRepetitionType_REPEATED
			schema.RepetitionType = &rt2
			var numField2 int32 = 2
			schema.NumChildren = &numField2
			ct2 := parquet.ConvertedType_MAP_KEY_VALUE
			schema.ConvertedType = &ct2
			schemaElements = append(schemaElements, schema)
			newInfo = common.NewTag()
			common.DeepCopy(item.Info, newInfo)
			newInfo.InName, newInfo.ExName = "Key_value", "key_value"
			infos = append(infos, newInfo)

			// Value is pushed first and key second, so the key is popped
			// (and emitted) before the value.
			newItem := NewItem()
			newItem.Info = common.GetValueTagMap(item.Info)
			newItem.GoType = item.GoType.Elem()
			if newItem.GoType.Kind() == reflect.Ptr {
				newItem.Info.RepetitionType = parquet.FieldRepetitionType_OPTIONAL
				newItem.GoType = item.GoType.Elem().Elem()
			} else {
				newItem.Info.RepetitionType = parquet.FieldRepetitionType_REQUIRED
			}
			stack = append(stack, newItem)

			newItem = NewItem()
			newItem.Info = common.GetKeyTagMap(item.Info)
			newItem.GoType = item.GoType.Key()
			newItem.Info.RepetitionType = parquet.FieldRepetitionType_REQUIRED
			stack = append(stack, newItem)

		default:
			// Leaf column: the schema element is derived from the tag alone.
			leaf, leafErr := common.NewSchemaElementFromTagMap(item.Info)
			if leafErr != nil {
				return nil, fmt.Errorf("failed to create schema from tag map: %s", leafErr.Error())
			}
			schemaElements = append(schemaElements, leaf)
			newInfo = common.NewTag()
			common.DeepCopy(item.Info, newInfo)
			infos = append(infos, newInfo)
		}
	}

	res := NewSchemaHandlerFromSchemaList(schemaElements)
	res.Infos = infos
	res.CreateInExMap()
	return res, nil
}