|
@@ -0,0 +1,817 @@
|
|
1
|
+#!/usr/bin/python3 -ttOO
|
|
2
|
+'''
|
|
3
|
+Artificial Intelligence Research Group
|
|
4
|
+University of Lleida
|
|
5
|
+'''
|
|
6
|
+
|
|
7
|
+# Libraries
|
|
8
|
+
|
|
9
|
+import os
|
|
10
|
+import sys
|
|
11
|
+import argparse
|
|
12
|
+import xml.etree.ElementTree as ET
|
|
13
|
+import math
|
|
14
|
+import networkx
|
|
15
|
+import ast
|
|
16
|
+import udebg
|
|
17
|
+
|
|
18
|
+# Classes
|
|
19
|
+
|
|
20
|
+class RedditG():
|
|
21
|
+ '''
|
|
22
|
+ Reddit post in graph structure for its analysis
|
|
23
|
+ '''
|
|
24
|
    def __init__(self, input_file):
        '''
        Build the debate graph structures from the XML conversation dump.

        Parameters:
            input_file: path to the XML file with the Reddit conversation.
        '''
        self.xml = None                         # Parsed ElementTree of the input XML
        self.root_id = None                     # Id of the root argument (the one carrying the post title)
        self.DebT = networkx.DiGraph() # Debate Tree
        self.PDebT = networkx.DiGraph() # Pruned Debate Tree
        self.SDebT = networkx.DiGraph() # Two-Sided Debate Tree
        self.WBDebG = None # Weighted Bipartite Debate Graph
        self.comment_id = {}                    # Maps Reddit comment id -> sequential node id (str), filled in WBDebG2xml
        self.min_weight = None                  # Minimum node weight, set in WBDebG2xml
        self.max_weight = None                  # Maximum node weight, set in WBDebG2xml
        self.VAF_accepted = None                # Ids accepted by the VAF solver; set externally before stats/drawing
        self.read_xml(input_file)               # Populates self.DebT (and root_id) from the file
|
|
36
|
+
|
|
37
|
+ def remove_deleted_comments(self):
|
|
38
|
+ rg = self.DebT.reverse() # Reverse graph (direction of edges reversed)
|
|
39
|
+ while True: # To avoid "RuntimeError: dictionary changed size during iteration" in inner loop
|
|
40
|
+ changed = False
|
|
41
|
+ for n, nd in self.DebT.nodes(data = True):
|
|
42
|
+ if nd['data'].get('author') == 'None':
|
|
43
|
+ if n in self.DebT: # Not already removed (appears in a previous subtree)
|
|
44
|
+ # Get subtree of node n and remove it
|
|
45
|
+ st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, n)
|
|
46
|
+ self.DebT.remove_nodes_from(st.nodes())
|
|
47
|
+ changed = True
|
|
48
|
+ break
|
|
49
|
+ if not changed:
|
|
50
|
+ break
|
|
51
|
+
|
|
52
|
    def read_xml(self, input_file):
        '''
        Read XML file with the conversation.

        Populates self.DebT with one node per <arg> element and one edge per
        reply <pair> (t replies to h), sets self.root_id to the argument that
        carries the post title, then prunes deleted comments.
        '''
        print('Reading xml input file...')
        self.xml = ET.parse(input_file)
        al = self.xml.find('argument-list')
        for arg in al.iter('arg'):
            # The argument holding the post title is the debate root
            if 'title' in arg.attrib:
                self.root_id = arg.attrib['id']
                if arg.attrib['author'] == 'None': # To not delete full tree when root node author is deleted
                    arg.attrib['author'] = 'root_node_author'
            self.DebT.add_node(arg.attrib['id'], data = arg)
        ap = self.xml.find('argument-pairs')
        for pair in ap.iter('pair'): # Argument pair (relation) t replies to h
            self.DebT.add_edge(pair.find('t').get('id'), pair.find('h').get('id'), data = pair)
        self.remove_deleted_comments()
|
|
69
|
+
|
|
70
|
    def wia2021_DebT(self, args):
        '''
        DebT for wia2021.

        Assigns a chronological id to every comment (root gets 0) and
        computes the raw, not normalized sentiment in [-2, 2] for every
        non-root node, stored as 'sentiment_not_normalized'.
        '''
        print('Generating DebT for wia2021...')
        # Set chronological id to comments in DebT
        # NOTE(review): ids are sorted lexicographically; presumably Reddit ids
        # sort chronologically -- confirm against the data source.
        id_list = sorted([n for n, nd in self.DebT.nodes(data = True) if 'title' not in nd['data'].attrib])
        for i, c_id in enumerate(id_list):
            self.DebT.nodes[c_id]['chrono_id'] = i + 1 # chrono_id for root node set below
        print(' Number of nodes DebT = {}'.format(self.DebT.number_of_nodes()))
        # Initializations
        self.DebT.nodes[self.root_id]['sentiment_not_normalized'] = 0
        self.DebT.nodes[self.root_id]['chrono_id'] = 0
        # BFS on DebT to compute sentiment not normalized [-2, 2]
        # Edges point child -> parent, so in_edges(x) yields the replies to x.
        list_edges = [e for e in self.DebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop(0)   # pop(0): FIFO, i.e. breadth-first
            node_id = current_edge[0]          # The replying (child) node
            sentiment = self.ccia18_sentiment(False, ast.literal_eval(self.DebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            self.DebT.nodes[node_id]['sentiment_not_normalized'] = sentiment
            list_edges.extend([e for e in self.DebT.in_edges(node_id)])

        if args.draw_graphs:
            self.wia2021_draw_DebT(args)
|
|
94
|
+
|
|
95
|
    def wia2021_SDebT(self, args):
        '''
        SDebT for wia2021.

        Copies DebT into SDebT and labels each node with a 'side' (1 = with
        the root, -1 = against it), propagated top-down: a node keeps its
        parent's side when it supports the parent (positive sentiment) and
        flips it otherwise.
        '''
        print('Generating SDebT for wia2021...')
        # Copy DebT to SDebT
        self.SDebT.add_nodes_from(self.DebT.nodes(data = True))
        for e1, e2, ed in self.DebT.edges(data = True):
            self.SDebT.add_edge(e1, e2, data = self.DebT[e1][e2]['data'])
        # Initializations
        self.SDebT.nodes[self.root_id]['side'] = 1
        # BFS to compute the side of each node
        list_edges = [e for e in self.SDebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop(0)    # FIFO: parents are settled before children
            node_id = current_edge[0]           # Replying (child) node
            parent_node_id = current_edge[1]
            # Same side as parent when agreeing with it, opposite side otherwise.
            # Note: sentiment == 0 counts as disagreement with a side-1 parent.
            if (self.SDebT.nodes[parent_node_id]['side'] == 1 and self.DebT.nodes[node_id]['sentiment_not_normalized'] > 0) or (self.SDebT.nodes[parent_node_id]['side'] == -1 and self.DebT.nodes[node_id]['sentiment_not_normalized'] <= 0):
                self.SDebT.nodes[node_id]['side'] = 1
            else:
                self.SDebT.nodes[node_id]['side'] = -1
            list_edges.extend([e for e in self.SDebT.in_edges(node_id)])

        if args.draw_graphs:
            self.wia2021_draw_SDebT(args)
|
|
120
|
+
|
|
121
|
    def wia2021_draw_DebT(self, args):
        '''
        Drawing wia2021 DebT.

        Renders the Debate Tree to "<input_file>.debt.png" via graphviz.
        Nodes are labelled with their chronological id; edges are tinted
        green for positive child sentiment, red for negative, black for zero.
        '''
        print('Drawing wia2021 DebT...')
        gv = networkx.nx_agraph.to_agraph(self.DebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        gv.node_attr['fillcolor'] = '#0000FF'
        gv.node_attr['fontcolor'] = '#FFFFFF'

        for n in gv.nodes():
            n.attr['label'] = str(self.DebT.nodes[n]['chrono_id'])

        gv.edge_attr['color'] = '#000000'
        for e in gv.edges():
            # Sentiment of the replying (source) node drives the edge color
            s = self.DebT.nodes[e[0]]['sentiment_not_normalized']
            if s > 0:
                contrast, color = udebg.get_weighted_color([0x00, 0xFF, 0x00], 0, 2, s)
                # Format the [r, g, b] ints as '#RRGGBB'
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
            elif s < 0:
                contrast, color = udebg.get_weighted_color([0xFF, 0x00, 0x00], 0, 2, -s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])

        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.debt.png" % args.input_file, format = 'png')
|
|
149
|
+
|
|
150
|
    def wia2021_draw_SDebT(self, args):
        '''
        Drawing wia2021 SDebT.

        Renders the Two-Sided Debate Tree to "<input_file>.sdebt.png".
        Node fill encodes the side (cyan = with root, dark blue = against);
        edge color encodes the child's sentiment as in wia2021_draw_DebT.
        '''
        print('Drawing wia2021 SDebT...')
        gv = networkx.nx_agraph.to_agraph(self.SDebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        gv.node_attr['fillcolor'] = '#0000FF'
        gv.node_attr['fontcolor'] = '#FFFFFF'

        for n in gv.nodes():
            n.attr['label'] = str(self.SDebT.nodes[n]['chrono_id'])
            side = self.SDebT.nodes[n]['side']
            if side == 1:
                n.attr['fontcolor'] = '#000000'
                n.attr['fillcolor'] = '#4FCFFF' # light green = '#6FFF6F', cyan = '#4FCFFF'
            else:
                n.attr['fontcolor'] = '#FFFFFF'
                n.attr['fillcolor'] = '#00007F' # light red = '#FF6F6F', dark blue = '#00007F'

        gv.edge_attr['color'] = '#000000'
        for e in gv.edges():
            # Sentiment of the replying (source) node drives the edge color
            s = self.SDebT.nodes[e[0]]['sentiment_not_normalized']
            if s > 0:
                contrast, color = udebg.get_weighted_color([0x00, 0xFF, 0x00], 0, 2, s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
            elif s < 0:
                contrast, color = udebg.get_weighted_color([0xFF, 0x00, 0x00], 0, 2, -s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])

        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.sdebt.png" % args.input_file, format = 'png')
|
|
185
|
+
|
|
186
|
+ def ccia18_sentiment(self, normalized, sentiment_distribution, args):
|
|
187
|
+ '''
|
|
188
|
+ Computes the sentiment from a sentiment distribution of 5 values
|
|
189
|
+ Normalized: [very neg, neg, neutral, pos, very pos] --> [neg = -1, neutral = 0, pos = 1]
|
|
190
|
+ Not normalized: --> [-2, 2]
|
|
191
|
+ '''
|
|
192
|
+ sentiment_relevance = [-2, -1, 0, 1, 2]
|
|
193
|
+ res = [a * b for a, b in zip(sentiment_relevance, sentiment_distribution)]
|
|
194
|
+ res = sum(res)
|
|
195
|
+ if not normalized:
|
|
196
|
+ return res
|
|
197
|
+ if abs(res) > args.alpha:
|
|
198
|
+ if res > 0:
|
|
199
|
+ return 1 # Positive
|
|
200
|
+ else:
|
|
201
|
+ return -1 # Negative
|
|
202
|
+ else:
|
|
203
|
+ return 0 # Neutral
|
|
204
|
+
|
|
205
|
+ def get_node_color(self, base_color, w):
|
|
206
|
+ hw = 0xCF
|
|
207
|
+ if w >= self.max_weight:
|
|
208
|
+ hw = 0
|
|
209
|
+ elif self.max_weight > self.min_weight:
|
|
210
|
+ hw = int(hw * (float(self.max_weight - w) / float(self.max_weight - self.min_weight)))
|
|
211
|
+ color = [a | b for a, b in zip(base_color, [hw, hw, hw])]
|
|
212
|
+ return color
|
|
213
|
+
|
|
214
|
    def ccia18_analysis(self, args):
        '''
        Weighted Bipartite Graph analysis.

        Builds the Pruned Debate Tree (PDebT) from DebT: each node gets a
        depth, a normalized sentiment and a bipartite set (1 = in favor of
        the root, -1 = against); neutral comments and their whole subtrees
        are pruned. Finally derives self.WBDebG from the PDebT.
        '''
        print('Generating PDebT and WBDebG...')
        # Copy DebT to PDebT
        self.PDebT.add_nodes_from(self.DebT.nodes(data = True))
        for e1, e2, ed in self.DebT.edges(data = True):
            self.PDebT.add_edge(e1, e2, data = self.DebT[e1][e2]['data'])
        # Initializations
        self.PDebT.nodes[self.root_id]['bipartite_set'] = 1 # 1 in favor of root, -1 not in favor
        self.PDebT.nodes[self.root_id]['sentiment'] = 1
        self.DebT.nodes[self.root_id]['sentiment_not_normalized'] = 0
        rg = self.PDebT.reverse() # Reverse graph (direction of edges reversed)
        # DFS on PDebT before removing nodes to save DebT sentiment not normalized
        # (also computes each node's depth and the overall max depth)
        list_edges = [e for e in self.PDebT.in_edges(self.root_id)]
        self.PDebT.nodes[self.root_id]['depth'] = 1
        max_depth = 1
        while list_edges:
            current_edge = list_edges.pop()    # pop(): LIFO, i.e. depth-first
            node_id = current_edge[0]          # Replying (child) node
            self.PDebT.nodes[node_id]['depth'] = self.PDebT.nodes[current_edge[1]]['depth'] + 1
            if self.PDebT.nodes[node_id]['depth'] > max_depth:
                max_depth = self.PDebT.nodes[node_id]['depth']
            sentiment = self.ccia18_sentiment(False, ast.literal_eval(self.PDebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            self.DebT.nodes[node_id]['sentiment_not_normalized'] = sentiment
            list_edges.extend([e for e in self.PDebT.in_edges(node_id)])
        self.PDebT.nodes[self.root_id]['max_depth'] = max_depth
        # DFS and prune PDebT
        list_edges = [e for e in self.PDebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop()
            node_id = current_edge[0]
            father_id = current_edge[1]
            sentiment = self.ccia18_sentiment(True, ast.literal_eval(self.PDebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            # A supporter inherits its parent's set; an attacker gets the opposite set
            if sentiment == 1: # Positive
                self.PDebT.nodes[node_id]['bipartite_set'] = self.PDebT.nodes[father_id]['bipartite_set']
            elif sentiment == -1: # Negative
                self.PDebT.nodes[node_id]['bipartite_set'] = -self.PDebT.nodes[father_id]['bipartite_set']
            if sentiment == 0: # Neutral: remove subtree
                st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, node_id)
                self.PDebT.remove_nodes_from(st.nodes())
            else: # Not Neutral
                self.PDebT.nodes[node_id]['sentiment'] = sentiment
                # Only traverse into kept subtrees (pruned nodes are gone)
                list_edges.extend([e for e in self.PDebT.in_edges(node_id)])
        # Create the WBDebG
        self.WBDebG = self.PDebG_to_WBDebG(self.PDebT)
|
|
261
|
+
|
|
262
|
+ def PDebG_to_WBDebG(self, PDebT):
|
|
263
|
+ '''
|
|
264
|
+ Create the WBDebG from the PDebT
|
|
265
|
+ '''
|
|
266
|
+ WBDebG = networkx.DiGraph()
|
|
267
|
+ WBDebG.add_nodes_from(PDebT.nodes(data = True))
|
|
268
|
+ for e1, e2, ed in PDebT.edges(data = True):
|
|
269
|
+ if WBDebG.nodes[e1]['bipartite_set'] != WBDebG.nodes[e2]['bipartite_set']:
|
|
270
|
+ WBDebG.add_edge(e1, e2, data = PDebT[e1][e2]['data'])
|
|
271
|
+ return WBDebG
|
|
272
|
+
|
|
273
|
    def WBDebG2xml(self, args):
        '''
        Saves self.WBDebG graph to xml file ("<input_file>.wbg.xml").

        Side effects: renumbers argument ids sequentially (root gets '0'),
        records the mapping in self.comment_id, stores per-node weights in
        the graph and sets self.min_weight / self.max_weight.
        Note: mutates the underlying Element objects in place.
        '''
        xml = ET.Element('entailment-corpus')
        xml.append(ET.Comment(args2str(args)))
        al_xml = ET.SubElement(xml, 'argument-list')
        i = 1 # 0 for root
        # Seed min/max with the root's scaled score
        maxw = minw = scale_weight(int(self.WBDebG.nodes[self.root_id]['data'].get('score')), args)
        for n_id, nd in self.WBDebG.nodes(data = True):
            a = nd['data']
            w = scale_weight(int(a.get('score')), args)
            self.WBDebG.nodes[n_id]['weight'] = w
            if w < minw:
                minw = w
            elif w > maxw:
                maxw = w
            a.set('weight', str(w))
            a.set('bipartite_set', str(self.WBDebG.nodes[n_id]['bipartite_set']))
            a.set('comment_id', a.get('id'))   # Preserve the original Reddit id
            if a.get('id') == self.root_id:
                # Id 0 for root node
                self.comment_id[a.get('id')] = '0'
                a.set('id', '0')
            else:
                self.comment_id[a.get('id')] = str(i)
                a.set('id', str(i))
                i = i + 1
            al_xml.append(a)
        al_xml.set('minweight', str(minw))
        al_xml.set('maxweight', str(maxw))
        self.min_weight = minw
        self.max_weight = maxw
        xml.set('num_nodes', str(i))
        ap_xml = ET.SubElement(xml, 'argument-pairs')
        i = 0
        for e1, e2, ed in self.WBDebG.edges(data = True):
            p = ed['data']
            p.set('entailment', 'ATTACKS')     # All WBDebG edges are attacks by construction
            t = p.find('t')
            t.set('comment_id', t.get('id'))
            # Rewrite the pair endpoints with the new sequential ids
            t.set('id', self.WBDebG.nodes[t.get('id')]['data'].get('id'))
            h = p.find('h')
            h.set('comment_id', h.get('id'))
            h.set('id', self.WBDebG.nodes[h.get('id')]['data'].get('id'))
            ap_xml.append(p)
            i = i + 1
        xml.set('num_edges', str(i))
        ET.ElementTree(xml).write("%s.wbg.xml" % args.input_file)
|
|
322
|
+
|
|
323
|
    def WBDebG_to_xml(self, args, WBDebG, tmp_file_name):
        '''
        Saves a WBDebG graph to a xml file, it uses the information generated
        in WBDebG2xml (argument elements are appended as-is, without the id
        renumbering performed there).

        Parameters:
            WBDebG: the graph to serialize (nodes carry 'data' Elements).
            tmp_file_name: destination path for the XML file.
        '''
        xml = ET.Element('entailment-corpus')
        xml.append(ET.Comment(args2str(args)))
        al_xml = ET.SubElement(xml, 'argument-list')
        # Seed min/max with the root's scaled score
        maxw = minw = scale_weight(int(WBDebG.nodes[self.root_id]['data'].get('score')), args)
        for n_id, nd in WBDebG.nodes(data = True):
            a = nd['data']
            w = scale_weight(int(a.get('score')), args)
            WBDebG.nodes[n_id]['weight'] = w
            if w < minw:
                minw = w
            elif w > maxw:
                maxw = w
            al_xml.append(a)
        al_xml.set('minweight', str(minw))
        al_xml.set('maxweight', str(maxw))
        xml.set('num_nodes', str(WBDebG.number_of_nodes()))
        ap_xml = ET.SubElement(xml, 'argument-pairs')
        for e1, e2, ed in WBDebG.edges(data = True):
            p = ed['data']
            ap_xml.append(p)
        xml.set('num_edges', str(WBDebG.number_of_edges()))
        ET.ElementTree(xml).write('%s' % tmp_file_name)
|
|
349
|
+
|
|
350
|
    def draw_ccia18_PCT(self, args):
        '''
        Drawing Polarized Comment Tree.

        Renders the PDebT to "<input_file>.pct.png": green-ish fill for
        positive-sentiment nodes, purple for negative, shaded by the node's
        weight (see get_node_color).
        '''
        print('Drawing Polarized Comment Tree...')
        gv = networkx.nx_agraph.to_agraph(self.PDebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        for n in gv.nodes():
            # Sequential id assigned in WBDebG2xml is used as the label
            node_id = int(self.PDebT.nodes[n]['data'].get('id'))
            n.attr['label'] = str(node_id)
            if self.PDebT.nodes[n]['sentiment'] == 1:
                fillcolor = self.get_node_color([0x80, 0xFF, 0x40], self.WBDebG.nodes[n]['weight'])
            else:
                fillcolor = self.get_node_color([0x7F, 0x00, 0x7F], self.WBDebG.nodes[n]['weight'])
            # Format the [r, g, b] ints as '#RRGGBB'
            n.attr['fillcolor'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, fillcolor)])
        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.pct.png" % args.input_file, format = 'png')
|
|
370
|
+
|
|
371
|
    def draw_ccia18_WBG(self, args):
        '''
        Drawing Weighted Biparite Graph.

        Renders self.WBDebG to "<input_file>.wbg.png" (or ".wbg-sol.png"
        when a VAF solution is available). Blue fill = in favor of root,
        red = against; rejected nodes are blacked out with a thick border
        in their original color.
        '''
        # TODO: Grouping nodes
        # https://stackoverflow.com/questions/19674316/grouping-nodes-with-the-same-color-near-each-other-in-graphviz
        if self.VAF_accepted:
            print('Drawing Weighted Biparite Graph solution...')
            output_file_name = '%s.wbg-sol.png' % args.input_file
        else:
            print('Drawing Weighted Biparite Graph...')
            output_file_name = '%s.wbg.png' % args.input_file
        gv = networkx.nx_agraph.to_agraph(self.WBDebG)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        #gv.edge_attr['color'] = '#FF8080'
        for n in gv.nodes():
            node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
            n.attr['label'] = str(node_id)
            bordercolor = [0x00, 0x00, 0x00]
            penwidth = 1
            if self.WBDebG.nodes[n]['bipartite_set'] == 1:
                fillcolor = self.get_node_color([0x00, 0x00, 0xFF], self.WBDebG.nodes[n]['weight'])
            else:
                fillcolor = self.get_node_color([0xFF, 0x00, 0x00], self.WBDebG.nodes[n]['weight'])
            if self.VAF_accepted:
                if node_id not in self.VAF_accepted:
                    # Rejected node: keep its color on the border, black out the fill
                    bordercolor = fillcolor
                    penwidth = 3
                    fillcolor = self.get_node_color([0x00, 0x00, 0x00], self.WBDebG.nodes[n]['weight'])
            # fillcolor format '#RRGGBB', for example: fillcolor = '#FF8080'
            n.attr['fillcolor'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, fillcolor)])
            n.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, bordercolor)])
            n.attr['penwidth'] = penwidth
        #for e in gv.edges():
        #    e.attr['color'] = '#FF8080'
        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw(output_file_name, format = 'png')
|
|
411
|
+
|
|
412
|
    def ccia2018_stats_to_file(self, args):
        '''
        Write WBDebG statistics and information to file
        ("<input_file>.wbg-sol.info").

        Requires self.VAF_accepted to be populated. When args.ccia2019_order
        is set, additionally computes per-author statistics (CCIA 2019) and
        appends them via get_stats_ccia2019.
        '''
        print('Writting statistics to file...')
        num_nodes = self.WBDebG.number_of_nodes()
        out_str = 'CCIA 2018 stats\n---------------\nInput file = %s\n' % args.input_file
        out_str += 'Number of nodes = %i\n' % num_nodes
        out_str += 'Number of edges = %i\n' % self.WBDebG.number_of_edges()
        out_str += 'PDebT maximum depth = %i\n\n' % self.PDebT.nodes[self.root_id]['max_depth']
        yes_fav = not_fav = 0 # Number of comments in favor and not in favor
        yes_fav_sol = not_fav_sol = 0 # Number of comments in favor and not in favor in the solution
        yes_fav_solw = not_fav_solw = 0 # Weight of comments in favor and not in favor in the solution
        for n, nd in self.WBDebG.nodes(data = True):
            node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
            if nd['bipartite_set'] == 1:
                yes_fav += 1
                if node_id in self.VAF_accepted:
                    yes_fav_sol += 1
                    yes_fav_solw += nd['weight']
            else:
                not_fav += 1
                if node_id in self.VAF_accepted:
                    not_fav_sol += 1
                    not_fav_solw += nd['weight']
        out_str += '#Nodes in favor = %i (%.2f%% of total)\n' % (yes_fav, 100.0 * yes_fav / num_nodes)
        out_str += '#Nodes not in favor = %i (%.2f%% of total)\n\n' % (not_fav, 100.0 * not_fav / num_nodes)

        out_str += '#Nodes in favor in solution = %i (%.2f%% of in favor)\n' % (yes_fav_sol, 100.0 * yes_fav_sol / yes_fav)
        out_str += '#Nodes not in favor in solution = %i (%.2f%% of not in favor)\n' % (not_fav_sol, 100.0 * not_fav_sol / not_fav)
        out_str += 'Percentage difference = %0.2f%%\n\n' % abs(100.0 * yes_fav_sol / yes_fav - 100.0 * not_fav_sol / not_fav)

        out_str += 'Weight of solution, in favor - not in favor = %i - %i = %i\n' % (yes_fav_solw, not_fav_solw, yes_fav_solw - not_fav_solw)
        out_str += 'Weight of solution normalized [-1, 1] = %0.2f\n' % (float(yes_fav_solw - not_fav_solw) / float(yes_fav_solw + not_fav_solw))

        # More author based stats for CCIA 2019
        if args.ccia2019_order:
            authors = {}
            for n, nd in self.WBDebG.nodes(data = True):
                # Flag counters: a value of 2 after both checks below means
                # the node is BOTH on that side AND in the solution
                yes_fav_in_sol = 0
                not_fav_in_sol = 0
                a = self.WBDebG.nodes[n]['data'].get('author')
                if a not in authors:
                    authors[a] = {}
                    authors[a]['num_nodes'] = 0 # Number of nodes
                    authors[a]['num_nodes_yes_fav'] = 0 # Number of nodes in favor of root
                    authors[a]['num_nodes_not_fav'] = 0 # Number of nodes not in favor of root
                    authors[a]['num_nodes_in_sol'] = 0 # Number of nodes in solution
                    authors[a]['num_nodes_not_in_sol'] = 0 # Number of nodes not in solution
                    authors[a]['num_nodes_yes_fav_in_sol'] = 0 # Number of nodes in favor of root and in solution
                    authors[a]['num_nodes_not_fav_in_sol'] = 0 # Number of nodes not in favor of root and in solution
                    authors[a]['num_in_edges'] = 0 # Replies to this node
                    authors[a]['num_in_edges_pos'] = 0 # Replies to this node that support it
                    authors[a]['num_in_edges_neg'] = 0 # Replies to this node that not support it
                    authors[a]['num_init_nodes'] = 0 # Number of nodes before alpha prunning
                    authors[a]['sum_sentiment_radicality'] = 0 # sum_{c \in Comments} abs(sentiment_rel(c))
                    authors[a]['scores'] = [] # List of scores of the author's comments

                authors[a]['num_nodes'] += 1
                authors[a]['scores'].append(int(self.WBDebG.nodes[n]['data'].get('score')))
                if self.WBDebG.nodes[n]['bipartite_set'] == 1:
                    authors[a]['num_nodes_yes_fav'] += 1
                    yes_fav_in_sol += 1
                else:
                    authors[a]['num_nodes_not_fav'] += 1
                    not_fav_in_sol += 1
                node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
                if node_id in self.VAF_accepted:
                    authors[a]['num_nodes_in_sol'] += 1
                    yes_fav_in_sol += 1
                    not_fav_in_sol += 1
                else:
                    authors[a]['num_nodes_not_in_sol'] += 1
                if yes_fav_in_sol == 2: # In favor AND accepted
                    authors[a]['num_nodes_yes_fav_in_sol'] += 1
                elif not_fav_in_sol == 2: # Not in favor AND accepted
                    authors[a]['num_nodes_not_fav_in_sol'] += 1

                authors[a]['num_in_edges'] += self.PDebT.in_degree(n)
                for e in self.PDebT.in_edges(n):
                    # Same bipartite set on both ends means the reply supports the node
                    if self.PDebT.nodes[e[0]]['bipartite_set'] == self.PDebT.nodes[e[1]]['bipartite_set']:
                        authors[a]['num_in_edges_pos'] += 1
                    else:
                        authors[a]['num_in_edges_neg'] += 1

            # Pre-prune counts come from the full DebT
            for n, nd in self.DebT.nodes(data = True):
                a = self.DebT.nodes[n]['data'].get('author')
                if a in authors:
                    authors[a]['num_init_nodes'] += 1 # Counting nodes removed by alpha prune
                    authors[a]['sum_sentiment_radicality'] += abs(self.DebT.nodes[n]['sentiment_not_normalized'])

            out_str += self.get_stats_ccia2019(authors, args)

        # Write to file
        output_file_name = '%s.wbg-sol.info' % args.input_file
        output_file = open(output_file_name, 'w')
        output_file.write(out_str)
        output_file.close()
|
|
510
|
+
|
|
511
|
    def get_stats_ccia2019(self, authors, args):
        '''
        Get statistics based on the CCIA 2019 paper.

        Derives per-author metrics (radicality, attention generation,
        polarization, score aggregates) from the counters accumulated in
        ccia2018_stats_to_file, then returns a formatted report string with
        the top-20 authors for every ordering in args.ccia2019_order.

        NOTE(review): divisions assume num_init_nodes, num_nodes and
        (num_nodes_yes_fav + num_nodes_not_fav) are > 0 for every author --
        holds for authors built from existing nodes, verify before reuse.
        '''
        for a in authors:
            # Radicality
            authors[a]['radicality'] = float(authors[a]['sum_sentiment_radicality']) / authors[a]['num_init_nodes']
            # Attention generator
            authors[a]['attention_generator_pos'] = float(authors[a]['num_in_edges_pos']) / authors[a]['num_nodes']
            authors[a]['attention_generator_neg'] = float(authors[a]['num_in_edges_neg']) / authors[a]['num_nodes']
            # Author polarization --> [-1, 1]
            if (authors[a]['num_nodes_yes_fav_in_sol'] + authors[a]['num_nodes_not_fav_in_sol']) > 0:
                authors[a]['pol_sol'] = float(authors[a]['num_nodes_yes_fav_in_sol'] - authors[a]['num_nodes_not_fav_in_sol']) / (authors[a]['num_nodes_yes_fav_in_sol'] + authors[a]['num_nodes_not_fav_in_sol'])
            else:
                authors[a]['pol_sol'] = None # No accepted comments for this author
            authors[a]['pol'] = float(authors[a]['num_nodes_yes_fav'] - authors[a]['num_nodes_not_fav']) / (authors[a]['num_nodes_yes_fav'] + authors[a]['num_nodes_not_fav'])
            # max(|score|) for all author comments
            authors[a]['max_abs_score'] = max(map(abs, authors[a]['scores']))
            # sum(|score|) for all author comments
            authors[a]['sum_abs_score'] = sum(map(abs, authors[a]['scores']))
            # sum(score) for all author comments
            authors[a]['sum_score'] = sum(authors[a]['scores'])
            # number of author comments
            authors[a]['num_comments'] = len(authors[a]['scores'])

        out_str = 'CCIA 2019 stats\n---------------\n'
        for ordering in args.ccia2019_order:
            # Sort by authors relevance choice (ccia2019_order paramater)
            sorted_authors = sorted(authors.items(), key = lambda a: a[1][ordering], reverse = True)

            out_str += 'Number of authors: %i\n' % len(authors)
            out_str += 'Sorted by: %s\n' % ordering

            # Output top X authors data
            data = ['author', 'max_abs_score', 'sum_abs_score', 'sum_score', 'num_comments', 'radicality', 'att_gen_pos', 'att_gen_neg', 'polarization']
            out_str += format_data(data)
            for a in sorted_authors[:20]:
                data = [a[0], a[1]['max_abs_score'], a[1]['sum_abs_score'], a[1]['sum_score'], a[1]['num_comments'], a[1]['radicality'], a[1]['attention_generator_pos'], a[1]['attention_generator_neg'], a[1]['pol']]
                out_str += format_data(data)

        return out_str
|
|
552
|
+
|
|
553
|
+ def prca2019_authors_relevance(self, args):
|
|
554
|
+ '''
|
|
555
|
+ Compute relevance(u) = sum_{c in Gamma | user(c) = u} W(score(c))
|
|
556
|
+ '''
|
|
557
|
+ print(' Computing authors relevance...')
|
|
558
|
+ authors = {}
|
|
559
|
+ for n, nd in self.PDebT.nodes(data = True):
|
|
560
|
+ a = self.PDebT.nodes[n]['data'].get('author')
|
|
561
|
+ if a not in authors:
|
|
562
|
+ authors[a] = {}
|
|
563
|
+ authors[a]['wscores'] = [] # List of scores of the author's comments
|
|
564
|
+ authors[a]['wscores'].append(scale_weight(int(self.PDebT.nodes[n]['data'].get('score')), args))
|
|
565
|
+ for a in authors:
|
|
566
|
+ authors[a]['sum_wscore'] = sum(authors[a]['wscores'])
|
|
567
|
+ return sorted(authors.items(), key = lambda a: a[1]['sum_wscore'], reverse = True)
|
|
568
|
+
|
|
569
|
+ def prca2019_remove_author(self, author, G):
|
|
570
|
+ '''
|
|
571
|
+ Return a copy of WBDebG with author removed (RDebT, Restricted DebT)
|
|
572
|
+ '''
|
|
573
|
+ # Copy graph
|
|
574
|
+ res_G = networkx.DiGraph()
|
|
575
|
+ res_G.add_nodes_from(G.nodes(data = True))
|
|
576
|
+ for e1, e2, ed in G.edges(data = True):
|
|
577
|
+ res_G.add_edge(e1, e2, data = G[e1][e2]['data'])
|
|
578
|
+ # Remove author
|
|
579
|
+ rg = res_G.reverse() # Reverse graph (direction of edges reversed)
|
|
580
|
+ for n, nd in res_G.nodes(data = True):
|
|
581
|
+ if nd['data'].get('author') == author:
|
|
582
|
+ if n in res_G and nd['data'].get('comment_id') != self.root_id: # Not already removed (appears in a previous subtree) and not root node
|
|
583
|
+ # Get subtree of node n and remove it
|
|
584
|
+ st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, n)
|
|
585
|
+ res_G.remove_nodes_from(st.nodes())
|
|
586
|
+ return res_G
|
|
587
|
+
|
|
588
|
+ def prca2019_get_stats(self, G):
|
|
589
|
+ '''
|
|
590
|
+ Get the stats needed of the authors' list from G
|
|
591
|
+ '''
|
|
592
|
+ G_stats = {}
|
|
593
|
+ G_stats['num_comments'] = G.number_of_nodes() # Number of comments
|
|
594
|
+ G_stats['list_comments_id'] = list(G.nodes()) # List of comments id (Reddit id)
|
|
595
|
+ G_stats['list_nodes_id'] = map(lambda x: int(self.comment_id[x]), G_stats['list_comments_id']) # List of nodes id [0, ...] (0 = root node)
|
|
596
|
+ G_stats['list_nodes_id_Cplus'] = [int(self.comment_id[n]) for n, nd in G.nodes(data = True) if nd['bipartite_set'] == 1] # List of nodes id in favor of root node
|
|
597
|
+ G_stats['list_nodes_id_Cminus'] = [int(self.comment_id[n]) for n, nd in G.nodes(data = True) if nd['bipartite_set'] == -1] # List of nodes id NOT in favor of root node
|
|
598
|
+ return G_stats
|
|
599
|
+
|
|
600
|
    def prca2019_analysis(self, args):
        '''
        Perform PRL VSI PR&CA 2019 analysis.

        For each of the args.prca2019 most relevant authors: build the
        restricted debate tree without that author, solve the resulting
        WBDebG with the VAF solver (via a temp XML file) and collect the
        stats; finally write everything with prca2019_stats_to_file.
        '''
        print('Performing VSI PR&CA 2019 analysis...')
        tmp_file_name = '/tmp/tmp-reddit-at-WBDebG.tmp.xml'
        sorted_authors = self.prca2019_authors_relevance(args) # Most relevant authors
        PDebT_stats = self.prca2019_get_stats(self.PDebT)      # Baseline stats with all authors
        res_PDebT_stats = {}   # Per-author stats of the restricted trees
        res_VAF_accepted = {}  # Per-author VAF solutions of the restricted graphs
        ai = 0
        for a, ad in sorted_authors[:args.prca2019]:
            print(' Analysing author "%s" (%i/%i)...' % (a, ai + 1, min(args.prca2019, len(sorted_authors))))
            res_PDebT = self.prca2019_remove_author(a, self.PDebT)
            res_PDebT_stats[a] = self.prca2019_get_stats(res_PDebT)
            res_WBDebG = self.PDebG_to_WBDebG(res_PDebT)
            self.WBDebG_to_xml(args, res_WBDebG, tmp_file_name)
            # VAF_solver is a module-level helper defined elsewhere in this file
            res_VAF_accepted[a] = VAF_solver(args, tmp_file_name)
            ai = ai + 1
        self.prca2019_stats_to_file(sorted_authors, PDebT_stats, res_PDebT_stats, res_VAF_accepted, args)
|
|
620
|
+
|
|
621
|
+ def prca2019_stats_to_file(self, sorted_authors, PDebT_stats, res_PDebT_stats, res_VAF_accepted, args):
|
|
622
|
+ '''
|
|
623
|
+ Compute PRL VSI PR&CA 2019 stats and outputs them to file
|
|
624
|
+ '''
|
|
625
|
+ output_file_name = '%s.prca2019.info' % args.input_file
|
|
626
|
+ print('Writting statistics to file "%s"...' % output_file_name)
|
|
627
|
+ out_str = 'Input file: %s\n' % args.input_file
|
|
628
|
+ out_str += 'Number of authors: %i\n' % len(sorted_authors)
|
|
629
|
+ out_str += 'Number of comments: %i\n' % PDebT_stats['num_comments']
|
|
630
|
+ data = ['author', '#comments', 'relevance', 'engaging', 'influence', 'rebalancing', 'rebalancing2']
|
|
631
|
+ out_str += format_data(data)
|
|
632
|
+ # Data from initial graphs and solution with all users
|
|
633
|
+ Ca = frozenset(PDebT_stats['list_nodes_id'])
|
|
634
|
+ Cplus = frozenset(PDebT_stats['list_nodes_id_Cplus'])
|
|
635
|
+ Cminus = frozenset(PDebT_stats['list_nodes_id_Cminus'])
|
|
636
|
+ S = frozenset(self.VAF_accepted)
|
|
637
|
+ polS = (len(S & Cplus) - len(S & Cminus)) / float(len(S))
|
|
638
|
+ lengaging = []
|
|
639
|
+ linfluence = []
|
|
640
|
+ lrebalancing = []
|
|
641
|
+ lrebalancing2 = []
|
|
642
|
+ for a, ad in sorted_authors[:args.prca2019]:
|
|
643
|
+ # Data of the restricted (without user) graphs and solution
|
|
644
|
+ Cau = frozenset(res_PDebT_stats[a]['list_nodes_id'])
|
|
645
|
+ Cplusu = frozenset(res_PDebT_stats[a]['list_nodes_id_Cplus'])
|
|
646
|
+ Cminusu = frozenset(res_PDebT_stats[a]['list_nodes_id_Cminus'])
|
|
647
|
+ Su = frozenset(res_VAF_accepted[a])
|
|
648
|
+ polSu = (len(Su & Cplusu) - len(Su & Cminusu)) / float(len(Su))
|
|
649
|
+ polSCau = (len(S & Cplus & Cau) - len(S & Cminus & Cau)) / float(len(S & Cau))
|
|
650
|
+ # engaging(u) = #(Ca \ Cau) / #Ca, conversation remaining after removing user
|
|
651
|
+ engaging = len(Ca - Cau) / float(len(Ca))
|
|
652
|
+ lengaging.append(engaging)
|
|
653
|
+ # influence(u) = (#((Cau \ Su) \cap S) + #(Su \cap (Ca \ S))) / #Cau, (u moved to S + u moved outside S) / # u comments
|
|
654
|
+ influence = (len((Cau - Su) & S) + len(Su & (Ca - S))) / float(len(Cau))
|
|
655
|
+ linfluence.append(influence)
|
|
656
|
+ # rebalancing(u) = |polarization(S) - polarization(Su)|, polarization(S) = (#(S \cap C+) - #(S \cap C-)) / #S, absolute change in polarization after removing user
|
|
657
|
+ rebalancing = abs(polS - polSu)
|
|
658
|
+ lrebalancing.append(rebalancing)
|
|
659
|
+ # rebalancing2(u) = |polarization(S \cap Cu) - polarization(Su)|, polarization(S) = (#(S \cap C+) - #(S \cap C-)) / #S, absolute change in polarization after removing user
|
|
660
|
+ rebalancing2 = abs(polSCau - polSu)
|
|
661
|
+ lrebalancing2.append(rebalancing2)
|
|
662
|
+ # Add row to output string
|
|
663
|
+ data = [a, len(ad['wscores']), ad['sum_wscore'], engaging, influence, rebalancing, rebalancing2]
|
|
664
|
+ out_str += format_data(data)
|
|
665
|
+ data = ['Mean', '', '', sum(lengaging) / len(lengaging), sum(linfluence) / len(linfluence), sum(lrebalancing) / len(lrebalancing), sum(lrebalancing2) / len(lrebalancing2)]
|
|
666
|
+ out_str += format_data(data)
|
|
667
|
+ # Write to file
|
|
668
|
+ output_file = open(output_file_name, 'w')
|
|
669
|
+ output_file.write(out_str)
|
|
670
|
+ output_file.close()
|
|
671
|
+
|
|
672
|
+# Functions
|
|
673
|
+
|
|
674
|
def args2str(args):
    '''Return a human-readable, multi-line dump of the parsed arguments.'''
    lines = [
        '=== Begin arguments ===',
        'Input file: %s' % args.input_file,
        'Alpha parameter: %f' % args.alpha,
        'Algorithm: %s' % args.algorithm,
        'Log scale base: %i' % args.log_base,
        'Socialarg git path: %s' % args.socialarg_path,
        'Spark path: %s' % args.spark_path,
        'Draw graphs: %s' % args.draw_graphs,
        'Neutral comments: %s' % args.neutral_comments,
        'CCIA2019 author ordering: %s' % args.ccia2019_order,
        'VSI PR&CA 2019 analysis: %i' % args.prca2019,
        'User-based analysis: %s' % args.user,
        'User valuation: %s' % args.user_valuation,
        'SCIP output: %s' % args.scip_output,
        'Parameters: %s' % args.params,
        'Random seed: %s' % args.seed,
        '=== End arguments ===',
    ]
    return '\n'.join(lines)
|
|
693
|
+
|
|
694
|
def scale_weight(weight, args):
    '''
    Scale a raw weight onto a logarithmic scale of base args.log_base.

    Weights below 1 collapse to 0; otherwise the result is
    floor(log_base(weight)) + 1.
    '''
    if weight >= 1:
        return int(math.floor(math.log(weight, args.log_base))) + 1
    return 0
|
|
700
|
+
|
|
701
|
def VAF_solver(args, input_file_name = None):
    '''
    Solve the discussion using the external VAF solver and return the list
    of accepted argument (node) ids.

    args -- parsed command line arguments (uses spark_path, socialarg_path,
        input_file)
    input_file_name -- optional name of an xml file ending in ".xml"; when
        omitted, "<args.input_file>.wbg" is solved instead

    Exits the program (sys.exit) if the Spark installation, the analyzer
    jar, or the solver output cannot be found or parsed.
    '''
    if input_file_name:
        print(' Solving graph with VAF solver...')
        input_file_name = input_file_name[:-4]  # strip the ".xml" extension
    else:
        print('Solving graph with VAF solver...')
        input_file_name = '%s.wbg' % args.input_file
    output_file_name = '/tmp/tmp-reddit-at.out'
    # Check that spark-submit and the analyzer jar exist before launching
    if os.path.isdir(os.path.expanduser(args.spark_path)):
        args.spark_path = os.path.abspath(os.path.expanduser(args.spark_path))
        spark_submit = '%s/bin/spark-submit' % args.spark_path
        if not os.path.exists(spark_submit):
            sys.exit('ERROR: spark-submit not found at "%s".' % spark_submit)
    else:
        sys.exit('ERROR: Spark folder not found "%s".' % args.spark_path)
    if os.path.isdir(os.path.expanduser(args.socialarg_path)):
        args.socialarg_path = os.path.abspath(os.path.expanduser(args.socialarg_path))
        analyzer_jar = '%s/distributed_social_network_analyzer/target/scala-2.12/social-network-analyzer_2.12-1.0.jar' % args.socialarg_path
        if not os.path.exists(analyzer_jar):
            sys.exit('ERROR: analyzer jar file not found at "%s".' % analyzer_jar)
    else:
        sys.exit('ERROR: socialarg git repo folder not found "%s".' % args.socialarg_path)
    # Run solver.
    # NOTE(review): "&>" is bash-specific and os.system uses /bin/sh; on
    # systems where sh is not bash the redirection may behave differently --
    # confirm the target environment before relying on the captured output.
    cmd = '(time %s --master local[4] --class "MainAppFromXML" %s %s) &> %s' % (spark_submit, analyzer_jar, input_file_name, output_file_name)
    os.system(cmd)
    # Parse solver output: accepted args are listed under <answer><arg id=...>
    accepted = []
    try:
        xml = ET.parse('%s-xml.sol' % input_file_name)
        answer = xml.find('answer')
        for a in answer.iter('arg'):
            accepted.append(int(a.attrib['id']))
    except Exception:  # was a bare except: must not swallow SystemExit/KeyboardInterrupt
        sys.exit('ERROR: something happened while parsing solver output "%s-xml.sol".' % input_file_name)
    return accepted
|
|
741
|
+
|
|
742
|
def format_data(data):
    '''
    Format a data list as one table row of fixed column widths.

    data -- list of cell values; non-zero floats are rendered with 10
        decimals, everything else via str(). Each cell is truncated/padded
        to its column width; cells are separated by "|" and the row ends
        with a newline.

    Returns the formatted row. Unlike the previous version, the input list
    is NOT consumed (no destructive pop) -- callers keep their list intact.
    '''
    # Widths for the 7 known columns, padded so any extra cells get 12 chars
    widths = [20, 9, 9, 9, 9, 11, 12] + [12] * len(data)
    cells = []
    for w, d in zip(widths, data):
        if isinstance(d, float) and d != 0:
            text = '{:0.10f}'.format(d)
        else:
            text = str(d)
        cells.append(text[:w].ljust(w))
    return '|'.join(cells) + '\n'
|
|
762
|
+
|
|
763
|
+# Main
|
|
764
|
+
|
|
765
|
if __name__ == '__main__' :
    # Parse command line arguments
    parser = argparse.ArgumentParser(description = 'Reddit Analysis Tool.')
    # Optional arguments
    parser.add_argument('-a', '--alpha', default = 0.5, type = float, help = 'Alpha parameter used as threshold for several functions (default: 0.5)', dest = 'alpha')
    parser.add_argument('-al', '--algorithm', type = str, default = 'g0', help = 'Algorithm and parameters in case available, see docstring for more information (default: g0)', dest = 'algorithm')
    parser.add_argument('--ccia2019_order', nargs = '+', type = str, choices = ['max_abs_score', 'sum_abs_score', 'sum_score', 'num_comments'], help = 'Author ordering for CCIA 2019 stats (default: max_abs_score)', dest = 'ccia2019_order')
    parser.add_argument('-d', '--draw_graphs', action = 'store_true', default = False, help = 'Draws the grafs of all the steps of the analysis (default: False)', dest = 'draw_graphs')
    parser.add_argument('-if', '--input_file', default = None, type = str, help = 'Input file name of the xml with the Reddit post information', dest = 'input_file')
    parser.add_argument('-lb', '--log_base', default = 10, type = int, help = 'Logarithmic scale base for weighting (default: 10)', dest = 'log_base')
    parser.add_argument('-nc', '--neutral_comments', nargs = '?', type = str, default = 'do_nothing', choices = ['do_nothing', 'remove_subtree', 'to_positive'], help = 'Neutral comments treatment (default: do_nothing)', dest = 'neutral_comments')
    parser.add_argument('-p', '--params', default = None, type = str, help = 'Argument used to specify parameters for some functionalities', dest = 'params')
    parser.add_argument('--prca2019', default = 0, type = int, help = 'PRL VSI in PR&CA 2019 analysis (default: 0)', dest = 'prca2019')
    parser.add_argument('-s', '--seed', default = None, type = str, help = 'Seed to initialize random numbers (default: None)', dest = 'seed')
    parser.add_argument('-so', '--scip_output', action = 'store_true', default = False, help = 'Outputs UDebG files in SCIP format to solve bipartition problem (default: False)', dest = 'scip_output')
    parser.add_argument('--socialarg_path', default = '~/git/socialarg', type = str, help = 'Path to the socialarg git repo (default: ~/git/socialarg)', dest = 'socialarg_path')
    parser.add_argument('-sp', '--spark_path', default = '~/.local/spark-2.2.1-bin-hadoop2.7', type = str, help = 'Spark path (default: ~/.local/spark-2.2.1-bin-hadoop2.7)', dest = 'spark_path')
    parser.add_argument('-u', '--user', default = None, type = str, choices = ['mdai2020', 'wia2021'], help = 'User-based analysis (default: None)', dest = 'user')
    parser.add_argument('-uv', '--user_valuation', default = None, type = str, choices = ['comment_karma', 'sum_scores'], help = 'User valuation for a VAF over UDebG (default: None)', dest = 'user_valuation')
    args = parser.parse_args()
    # Echo the effective configuration so runs are self-documenting
    print(args2str(args))

    # No input file defined: generate a random UDebG instead of analysing a post
    if not args.input_file:
        # Generate random UDebG
        udebg.UDebG(None, None, args)
        exit()
    # Read debate and create inital Graph
    rg = RedditG(args.input_file)
    # User-oriented analysis
    if args.user:
        # NOTE(review): only 'wia2021' is handled here; 'mdai2020' falls
        # through to the standard analysis below -- confirm this is intended
        if args.user == 'wia2021':
            rg.wia2021_DebT(args)
            rg.wia2021_SDebT(args)
            udebg.UDebG(rg.SDebT, rg.root_id, args)
            exit()
    # Perform analysis (WBDebG)
    rg.ccia18_analysis(args)
    # Output results
    rg.WBDebG2xml(args)
    if args.draw_graphs:
        rg.draw_ccia18_PCT(args)
        rg.draw_ccia18_WBG(args)
    # Compute solution using VAF solver
    rg.VAF_accepted = VAF_solver(args)
    # Output results with solution (same drawing call, now that
    # rg.VAF_accepted is populated)
    if args.draw_graphs:
        rg.draw_ccia18_WBG(args)
    # Compute stats: PR&CA 2019 analysis if requested, CCIA 2018 otherwise
    if args.prca2019:
        rg.prca2019_analysis(args)
    else:
        rg.ccia2018_stats_to_file(args)
|