I have a data in format:
id1 id2 value Something like
1 234 0.2
1 235 0.1
and so on. I want to convert it in json format:
{
"nodes": [ {"name":"1"}, #first element
{"name":"234"}, #second element
{"name":"235"} #third element
] ,
"links":[{"source":1,"target":2,"value":0.2},
{"source":1,"target":3,"value":0.1}
]
}
So, from the original data to above format.. the nodes contain all the set of (distinct) names present in the original data and the links are basically the line number of source and target in the values list returned by nodes. For example:
1 234 0.2
1 is in the first element in the list of values holded by the key "nodes" 234 is the second element in the list of values holded by the key "nodes"
Hence the link dictionary is {"source":1,"target":2,"value":0.2}
How do i do this efficiently in python.. I am sure there should be better way than what I am doing which is so messy :( Here is what I am doing from collections import defaultdict
def open_file(filename,output=None):
f = open(filename,"r")
offset = 3429
data_dict = {}
node_list = []
node_dict = {}
link_list = []
num_lines = 0
line_ids = []
for line in f:
line = line.strip()
tokens = line.split()
mod_wid = int(tokens[1]) + offset
if not node_dict.has_key(tokens[0]):
d = {"name": tokens[0],"group":1}
node_list.append(d)
node_dict[tokens[0]] = True
line_ids.append(tokens[0])
if not node_dict.has_key(mod_wid):
d = {"name": str(mod_wid),"group":1}
node_list.append(d)
node_dict[mod_wid] = True
line_ids.append(mod_wid)
link_d = {"source": line_ids.index(tokens[0]),"target":line_ids.index(mod_wid),"value":tokens[2]}
link_list.append(link_d)
if num_lines > 10000:
break
num_lines +=1
data_dict = {"nodes":node_list, "links":link_list}
print "{\n"
for k,v in data_dict.items():
print '"'+k +'"' +":\n [ \n "
for each_v in v:
print each_v ,","
print "\n],"
print "}"
open_file("lda_input.tsv")
csvmodule and thejsonmodule.