完整过程
1.观察数据,并开发一个初始化schema
拿到数据集
观察样例数据
指定模型
2.在janus里定义schema
3.导入数据,并遍历图以找出数据模型的缺点
4.在janus修改schema
5.变更数据适配新schema
6.确认数据模型可以回答我们的业务问题
1.数据模型
原始数据
抽象字段:
- concert
- piece
- orchestra
- composer
- conductor
- soloist
合并字段
- concert
- work
  - piece
- orchestra
- artist
  - composer
  - conductor
  - soloist
定义schema
2.janus定义schema
2.1 定义vertex label
bin/gremlin.sh
// Open a throwaway in-memory JanusGraph; 'inmemory' is the shorthand for storage.backend=inmemory.
graph = JanusGraphFactory.open('inmemory')
// Open a management transaction; the schema calls below go through it until mgmt.commit().
mgmt = graph.openManagement()
2.2 定义vertex properties and attach to label
vertex
// Create one vertex label per entity of the merged model. The returned label
// handles are reused below when attaching property keys and edge connections.
Orchestra = mgmt.makeVertexLabel('Orchestra').make()
Artist = mgmt.makeVertexLabel('Artist').make()
Work = mgmt.makeVertexLabel('Work').make()
Concert = mgmt.makeVertexLabel('Concert').make()
work's properties
// Property keys carried by Work vertices; each is single-valued per vertex.
title = mgmt.makePropertyKey('title').dataType(String.class).cardinality(Cardinality.SINGLE).make()
// The handle is named after its key: 'compositionYear' stores an Integer year, not a date.
compositionYear = mgmt.makePropertyKey('compositionYear').dataType(Integer.class).cardinality(Cardinality.SINGLE).make()
soloInstrument = mgmt.makePropertyKey('soloInstrument').dataType(String.class).cardinality(Cardinality.SINGLE).make()
// Attach the three keys to the Work label.
mgmt.addProperties(Work, title, compositionYear, soloInstrument)
other's properties
// Property keys for the non-Work labels; all are single-valued Strings.
name = mgmt.makePropertyKey('name').dataType(String.class).cardinality(Cardinality.SINGLE).make()
nationality = mgmt.makePropertyKey('nationality').dataType(String.class).cardinality(Cardinality.SINGLE).make()
gender = mgmt.makePropertyKey('gender').dataType(String.class).cardinality(Cardinality.SINGLE).make()
// 'name' is shared by Orchestra, Concert and Artist; only Artist also gets nationality and gender.
mgmt.addProperties(Orchestra, name)
mgmt.addProperties(Concert, name)
mgmt.addProperties(Artist, name, nationality, gender)
2.3 定义edge label及其multiplicity
// Edge labels and their multiplicity constraints (semantics per the JanusGraph schema docs):
// MANY2ONE - at most one outgoing edge of this label per vertex, so with the
//            Work -> Artist connection declared below a Work has at most one composer.
// SIMPLE   - at most one edge of this label between any given pair of vertices.
COMPOSER = mgmt.makeEdgeLabel('COMPOSER').multiplicity(MANY2ONE).make()
SOLOIST = mgmt.makeEdgeLabel('SOLOIST').multiplicity(SIMPLE).make()
CONDUCTOR = mgmt.makeEdgeLabel('CONDUCTOR').multiplicity(SIMPLE).make()
ORCHESTRA = mgmt.makeEdgeLabel('ORCHESTRA').multiplicity(SIMPLE).make()
INCLUDES = mgmt.makeEdgeLabel('INCLUDES').multiplicity(SIMPLE).make()
2.4 create connection
// Declare which vertex labels each edge label may join.
// Argument order is (edge label, outgoing vertex label, incoming vertex label),
// so in this first model the performers hang directly off Work.
mgmt.addConnection(COMPOSER, Work, Artist)
mgmt.addConnection(SOLOIST, Work, Artist)
mgmt.addConnection(CONDUCTOR, Work, Artist)
mgmt.addConnection(ORCHESTRA, Concert, Orchestra)
mgmt.addConnection(INCLUDES, Concert, Work)
2.5 commit tx
// Commit the management transaction that carries the schema definitions above.
mgmt.commit()
3.导入数据并测试
// Traversal source used by every g.* statement below.
g = graph.traversal()
create vertex
// Sample data: one orchestra, one concert, one work and two artists.
// .next() runs each traversal and binds the created Vertex (not a spent traversal) to the variable.
// The Work property is written under 'compositionYear', the key declared in the schema.
gremlin> ChicagoSymphonyOrchestra = g.addV('Orchestra').property('name','Chicago Symphony Orchestra').next()
==>v[4208]
gremlin> SalonenAndYoYoMa = g.addV('Concert').property('name','Salonen & Yo-Yo Ma').next()
==>v[4216]
gremlin> CelloConcertoWork = g.addV('Work').property('title','Cello Concerto').property('compositionYear',2017).property('soloInstrument','cello').next()
==>v[8312]
gremlin> SalonenArtist = g.addV('Artist').property('name','Salonen').property('nationality', 'Finnish').property('gender','male').next()
==>v[8304]
gremlin> YoYoMaArtist = g.addV('Artist').property('name','Yo-Yo Ma').property('nationality', 'American').property('gender','male').next()
==>v[4272]
create edge
// Wire the sample vertices together following the schema connections:
// Concert -> Orchestra, Concert -> Work, Work -> Artist.
// The numeric ids are the ones printed by the addV calls in this session
// (4216 Concert, 4208 Orchestra, 8312 Work, 8304 Salonen, 4272 Yo-Yo Ma);
// they will differ on another run.
g.addE("ORCHESTRA").from(g.V(4216)).to(g.V(4208))
g.addE("INCLUDES").from(g.V(4216)).to(g.V(8312))
g.addE("CONDUCTOR").from(g.V(8312)).to(g.V(8304))
g.addE("COMPOSER").from(g.V(8312)).to(g.V(8304))
g.addE("SOLOIST").from(g.V(8312)).to(g.V(4272))
测试
// From the Artist named Salonen, follow COMPOSER edges backwards to the works he composed,
// then CONDUCTOR edges forwards to their conductors; path() prints artist name, work title, conductor name.
// The WARN line appears because no index covers the label + name lookup.
gremlin> g.V().has('Artist', 'name', 'Salonen').in('COMPOSER').out('CONDUCTOR').path().by('name').by('title').by('name')
15:08:01 WARN org.janusgraph.graphdb.transaction.StandardJanusGraphTx - Query requires iterating over all vertices [(~label = Artist AND name = Salonen)]. For better performance, use indexes
==>[Salonen,Cello Concerto,Salonen]
question
- Salonen是什么时候指挥了他的Cello Concerto
- 和哪个乐团一起演出
原因
数据模型是Artist直接到Work
如果有多个Artist连到同一个Work(例如Yo-Yo Ma和Gilbert),就无法知道他们是否出现在同一场演出中。
解决
增加演出Performance Vertex,在Artist和Work中间。
4.改进Schema
修改schema
// Schema revision: put a Performance vertex between Work and its performers.
mgmt = graph.openManagement()

// Handles to labels that already exist in the schema.
Concert = mgmt.getVertexLabel('Concert')
Work = mgmt.getVertexLabel('Work')
Artist = mgmt.getVertexLabel('Artist')
Orchestra = mgmt.getVertexLabel('Orchestra')
INCLUDES = mgmt.getEdgeLabel('INCLUDES')
ORCHESTRA = mgmt.getEdgeLabel('ORCHESTRA')
CONDUCTOR = mgmt.getEdgeLabel('CONDUCTOR')
SOLOIST = mgmt.getEdgeLabel('SOLOIST')

// New elements: the Performance label, its date key (stored as a String) and the PERFORMED edge label.
Performance = mgmt.makeVertexLabel('Performance').make()
performanceDate = mgmt.makePropertyKey('performanceDate').dataType(String.class).cardinality(Cardinality.SINGLE).make()
mgmt.addProperties(Performance, performanceDate)
PERFORMED = mgmt.makeEdgeLabel('PERFORMED').multiplicity(ONE2MANY).make()

// Connections of the revised model: Work -> Performance, Concert -> Performance,
// and Performance -> Artist / Orchestra for the performers.
mgmt.addConnection(PERFORMED, Work, Performance)
mgmt.addConnection(INCLUDES, Concert, Performance)
mgmt.addConnection(ORCHESTRA, Performance, Orchestra)
mgmt.addConnection(CONDUCTOR, Performance, Artist)
mgmt.addConnection(SOLOIST, Performance, Artist)

mgmt.commit()
5.变更数据
connect Performance to Concert and Work
// For every (Work, Concert) pair linked by INCLUDES, create one Performance vertex,
// then add Work -PERFORMED-> Performance and Concert -INCLUDES-> Performance.
// The 'p' label is placed on the outer traversal (not inside a map() child traversal)
// so that the later select('p') can resolve it from the path.
g.V().hasLabel('Work').as('w').
  in('INCLUDES').hasLabel('Concert').as('c').
  addV('Performance').property('performanceDate', '2017-03-09').as('p').
  addE('PERFORMED').from('w').
  select('p').
  addE('INCLUDES').from('c').
  iterate()
connect Conductor and Soloist to Performance
remove the old connections from Work
// Re-home the CONDUCTOR edge: for each Performance, find the old Work -> Artist edge,
// add Performance -> Artist with the same label, then drop the old edge.
g.V().hasLabel('Performance').as('p').
  in('PERFORMED').
  outE('CONDUCTOR').as('OLD').
  inV().as('cond').
  addE('CONDUCTOR').from('p').
  select('OLD').drop().
  iterate()
// Same re-homing for the SOLOIST edge.
g.V().hasLabel('Performance').as('p').
  in('PERFORMED').
  outE('SOLOIST').as('OLD').
  inV().as('soloist').
  addE('SOLOIST').from('p').
  select('OLD').drop().
  iterate()
connect Orchestra to Performance
// Reach the orchestra via Performance <- Work <- Concert -> Orchestra and
// add a direct Performance -ORCHESTRA-> Orchestra edge.
g.V().hasLabel('Performance').as('p').
  in('PERFORMED').
  in('INCLUDES').
  out('ORCHESTRA').
  addE('ORCHESTRA').from('p').
  iterate()
assert 验证data
// Post-migration checks: the soloist now hangs off the Performance,
// and Work no longer has any outgoing performer or orchestra edges.
assert g.V().hasLabel('Performance').out('SOLOIST').hasLabel('Artist').count().next() == 1
assert g.V().hasLabel('Work').outE('CONDUCTOR').count().next() == 0
assert g.V().hasLabel('Work').outE('SOLOIST').count().next() == 0
assert g.V().hasLabel('Work').outE('ORCHESTRA').count().next() == 0
6.验证
// Artists who conducted a performance of a work they composed themselves.
g.V().hasLabel('Artist').as('a').
  in('COMPOSER').
  out('PERFORMED').
  out('CONDUCTOR').
  where(eq('a')).
  values('name')
// Same walk through the edges, printing the full path:
// artist name, edge label, work title, edge label, performance date, edge label, conductor name.
g.V().hasLabel('Artist').as('a').
  inE('COMPOSER').outV().
  outE('PERFORMED').inV().
  outE('CONDUCTOR').inV().
  where(eq('a')).
  path().
    by('name').by(label).
    by('title').by(label).
    by('performanceDate').by(label).
    by('name')