#!/usr/bin/python3 -ttOO
'''
Artificial Intelligence Research Group
University of Lleida
'''
# Libraries
import os
import sys
import argparse
import xml.etree.ElementTree as ET
import math
import networkx
import ast
import udebg

# Classes
class RedditG():
    '''
    Reddit post in graph structure for its analysis
    '''
    def __init__(self, input_file):
        self.xml = None
        self.root_id = None
        self.DebT = networkx.DiGraph() # Debate Tree
        self.PDebT = networkx.DiGraph() # Pruned Debate Tree
        self.SDebT = networkx.DiGraph() # Two-Sided Debate Tree
        self.WBDebG = None # Weighted Bipartite Debate Graph
        self.comment_id = {}
        self.min_weight = None
        self.max_weight = None
        self.VAF_accepted = None
        self.read_xml(input_file)

    def remove_deleted_comments(self):
        rg = self.DebT.reverse() # Reverse graph (direction of edges reversed)
        while True: # To avoid "RuntimeError: dictionary changed size during iteration" in inner loop
            changed = False
            for n, nd in self.DebT.nodes(data = True):
                if nd['data'].get('author') == 'None':
                    if n in self.DebT: # Not already removed (appears in a previous subtree)
                        # Get subtree of node n and remove it
                        st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, n)
                        self.DebT.remove_nodes_from(st.nodes())
                        changed = True
                        break
            if not changed:
                break
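
    # Illustration (hedged): edges in DebT point from reply to parent, so the
    # reply subtree of a node is only reachable in the reversed graph. For a
    # toy tree built as below, removing the subtree of 'b' drops 'b' and its
    # reply 'c' but keeps the root 'a':
    #   G = networkx.DiGraph(); G.add_edges_from([('b', 'a'), ('c', 'b')])
    #   networkx.dfs_tree(G.reverse(), 'b').nodes()  # -> {'b', 'c'}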

    def read_xml(self, input_file):
        '''
        Read XML file with the conversation
        '''
        print('Reading xml input file...')
        self.xml = ET.parse(input_file)
        al = self.xml.find('argument-list')
        for arg in al.iter('arg'):
            if 'title' in arg.attrib:
                self.root_id = arg.attrib['id']
                if arg.attrib['author'] == 'None': # Avoid deleting the full tree when the root node author is deleted
                    arg.attrib['author'] = 'root_node_author'
            self.DebT.add_node(arg.attrib['id'], data = arg)
        ap = self.xml.find('argument-pairs')
        for pair in ap.iter('pair'): # Argument pair (relation): t replies to h
            self.DebT.add_edge(pair.find('t').get('id'), pair.find('h').get('id'), data = pair)
        self.remove_deleted_comments()

    def wia2021_DebT(self, args):
        '''
        DebT for wia2021
        '''
        print('Generating DebT for wia2021...')
        # Set chronological id to comments in DebT
        id_list = sorted([n for n, nd in self.DebT.nodes(data = True) if 'title' not in nd['data'].attrib])
        for i, c_id in enumerate(id_list):
            self.DebT.nodes[c_id]['chrono_id'] = i + 1 # chrono_id for root node set below
        print(' Number of nodes DebT = {}'.format(self.DebT.number_of_nodes()))
        # Initializations
        self.DebT.nodes[self.root_id]['sentiment_not_normalized'] = 0
        self.DebT.nodes[self.root_id]['chrono_id'] = 0
        # BFS on DebT to compute sentiment not normalized [-2, 2]
        list_edges = [e for e in self.DebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop(0)
            node_id = current_edge[0]
            sentiment = self.ccia18_sentiment(False, ast.literal_eval(self.DebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            self.DebT.nodes[node_id]['sentiment_not_normalized'] = sentiment
            list_edges.extend([e for e in self.DebT.in_edges(node_id)])
        if args.draw_graphs:
            self.wia2021_draw_DebT(args)

    def wia2021_SDebT(self, args):
        '''
        SDebT for wia2021
        '''
        print('Generating SDebT for wia2021...')
        # Copy DebT to SDebT
        self.SDebT.add_nodes_from(self.DebT.nodes(data = True))
        for e1, e2, ed in self.DebT.edges(data = True):
            self.SDebT.add_edge(e1, e2, data = self.DebT[e1][e2]['data'])
        # Initializations
        self.SDebT.nodes[self.root_id]['side'] = 1
        # BFS to compute the side of each node
        list_edges = [e for e in self.SDebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop(0)
            node_id = current_edge[0]
            parent_node_id = current_edge[1]
            if (self.SDebT.nodes[parent_node_id]['side'] == 1 and self.DebT.nodes[node_id]['sentiment_not_normalized'] > 0) or (self.SDebT.nodes[parent_node_id]['side'] == -1 and self.DebT.nodes[node_id]['sentiment_not_normalized'] <= 0):
                self.SDebT.nodes[node_id]['side'] = 1
            else:
                self.SDebT.nodes[node_id]['side'] = -1
            list_edges.extend([e for e in self.SDebT.in_edges(node_id)])
        if args.draw_graphs:
            self.wia2021_draw_SDebT(args)
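
    # Worked example (hedged): sides propagate top-down. With the root on side
    # 1, a reply whose sentiment_not_normalized is > 0 agrees with its parent
    # and inherits the parent's side; otherwise it takes the opposite side.
    # So the chain root(side 1) <- c1(sentiment -0.8) <- c2(sentiment 1.2)
    # yields side(c1) = -1 (disagrees with root) and side(c2) = -1 (agrees
    # with c1).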

    def wia2021_draw_DebT(self, args):
        '''
        Drawing wia2021 DebT
        '''
        print('Drawing wia2021 DebT...')
        gv = networkx.nx_agraph.to_agraph(self.DebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        gv.node_attr['fillcolor'] = '#0000FF'
        gv.node_attr['fontcolor'] = '#FFFFFF'
        for n in gv.nodes():
            n.attr['label'] = str(self.DebT.nodes[n]['chrono_id'])
        gv.edge_attr['color'] = '#000000'
        for e in gv.edges():
            s = self.DebT.nodes[e[0]]['sentiment_not_normalized']
            if s > 0:
                contrast, color = udebg.get_weighted_color([0x00, 0xFF, 0x00], 0, 2, s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
            elif s < 0:
                contrast, color = udebg.get_weighted_color([0xFF, 0x00, 0x00], 0, 2, -s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.debt.png" % args.input_file, format = 'png')

    def wia2021_draw_SDebT(self, args):
        '''
        Drawing wia2021 SDebT
        '''
        print('Drawing wia2021 SDebT...')
        gv = networkx.nx_agraph.to_agraph(self.SDebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        gv.node_attr['fillcolor'] = '#0000FF'
        gv.node_attr['fontcolor'] = '#FFFFFF'
        for n in gv.nodes():
            n.attr['label'] = str(self.SDebT.nodes[n]['chrono_id'])
            side = self.SDebT.nodes[n]['side']
            if side == 1:
                n.attr['fontcolor'] = '#000000'
                n.attr['fillcolor'] = '#4FCFFF' # light green = '#6FFF6F', cyan = '#4FCFFF'
            else:
                n.attr['fontcolor'] = '#FFFFFF'
                n.attr['fillcolor'] = '#00007F' # light red = '#FF6F6F', dark blue = '#00007F'
        gv.edge_attr['color'] = '#000000'
        for e in gv.edges():
            s = self.SDebT.nodes[e[0]]['sentiment_not_normalized']
            if s > 0:
                contrast, color = udebg.get_weighted_color([0x00, 0xFF, 0x00], 0, 2, s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
            elif s < 0:
                contrast, color = udebg.get_weighted_color([0xFF, 0x00, 0x00], 0, 2, -s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.sdebt.png" % args.input_file, format = 'png')

    def ccia18_sentiment(self, normalized, sentiment_distribution, args):
        '''
        Computes the sentiment from a sentiment distribution of 5 values
        Normalized: [very neg, neg, neutral, pos, very pos] --> [neg = -1, neutral = 0, pos = 1]
        Not normalized: --> [-2, 2]
        '''
        sentiment_relevance = [-2, -1, 0, 1, 2]
        res = [a * b for a, b in zip(sentiment_relevance, sentiment_distribution)]
        res = sum(res)
        if not normalized:
            return res
        if abs(res) > args.alpha:
            if res > 0:
                return 1 # Positive
            else:
                return -1 # Negative
        else:
            return 0 # Neutral
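
    # Worked example (hedged, values illustrative): for the distribution
    # [0.1, 0.2, 0.3, 0.3, 0.1], the dot product with [-2, -1, 0, 1, 2] is
    # -0.2 - 0.2 + 0.0 + 0.3 + 0.2 = 0.1. Not normalized, the method returns
    # 0.1; normalized with the default alpha = 0.5 it returns 0 (neutral),
    # since |0.1| <= 0.5.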

    def get_node_color(self, base_color, w):
        hw = 0xCF
        if w >= self.max_weight:
            hw = 0
        elif self.max_weight > self.min_weight:
            hw = int(hw * (float(self.max_weight - w) / float(self.max_weight - self.min_weight)))
        color = [a | b for a, b in zip(base_color, [hw, hw, hw])]
        return color
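
    # Worked example (hedged): the weight is mapped to a grey level hw in
    # [0, 0xCF] (heavier comment -> smaller hw -> closer to the pure base
    # color) that is OR-ed into each channel. With min_weight = 0,
    # max_weight = 4 and base_color = [0x00, 0x00, 0xFF] (blue), a node of
    # weight 2 gets hw = int(0xCF * (4 - 2) / 4) = 0x67, hence the color
    # [0x67, 0x67, 0xFF].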

    def ccia18_analysis(self, args):
        '''
        Weighted Bipartite Graph analysis
        '''
        print('Generating PDebT and WBDebG...')
        # Copy DebT to PDebT
        self.PDebT.add_nodes_from(self.DebT.nodes(data = True))
        for e1, e2, ed in self.DebT.edges(data = True):
            self.PDebT.add_edge(e1, e2, data = self.DebT[e1][e2]['data'])
        # Initializations
        self.PDebT.nodes[self.root_id]['bipartite_set'] = 1 # 1 in favor of root, -1 not in favor
        self.PDebT.nodes[self.root_id]['sentiment'] = 1
        self.DebT.nodes[self.root_id]['sentiment_not_normalized'] = 0
        rg = self.PDebT.reverse() # Reverse graph (direction of edges reversed)
        # DFS on PDebT before removing nodes to save DebT sentiment not normalized
        list_edges = [e for e in self.PDebT.in_edges(self.root_id)]
        self.PDebT.nodes[self.root_id]['depth'] = 1
        max_depth = 1
        while list_edges:
            current_edge = list_edges.pop()
            node_id = current_edge[0]
            self.PDebT.nodes[node_id]['depth'] = self.PDebT.nodes[current_edge[1]]['depth'] + 1
            if self.PDebT.nodes[node_id]['depth'] > max_depth:
                max_depth = self.PDebT.nodes[node_id]['depth']
            sentiment = self.ccia18_sentiment(False, ast.literal_eval(self.PDebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            self.DebT.nodes[node_id]['sentiment_not_normalized'] = sentiment
            list_edges.extend([e for e in self.PDebT.in_edges(node_id)])
        self.PDebT.nodes[self.root_id]['max_depth'] = max_depth
        # DFS and prune PDebT
        list_edges = [e for e in self.PDebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop()
            node_id = current_edge[0]
            father_id = current_edge[1]
            sentiment = self.ccia18_sentiment(True, ast.literal_eval(self.PDebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            if sentiment == 1: # Positive
                self.PDebT.nodes[node_id]['bipartite_set'] = self.PDebT.nodes[father_id]['bipartite_set']
            elif sentiment == -1: # Negative
                self.PDebT.nodes[node_id]['bipartite_set'] = -self.PDebT.nodes[father_id]['bipartite_set']
            if sentiment == 0: # Neutral: remove subtree
                st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, node_id)
                self.PDebT.remove_nodes_from(st.nodes())
            else: # Not Neutral
                self.PDebT.nodes[node_id]['sentiment'] = sentiment
                list_edges.extend([e for e in self.PDebT.in_edges(node_id)])
        # Create the WBDebG
        self.WBDebG = self.PDebG_to_WBDebG(self.PDebT)

    def PDebG_to_WBDebG(self, PDebT):
        '''
        Create the WBDebG from the PDebT
        '''
        WBDebG = networkx.DiGraph()
        WBDebG.add_nodes_from(PDebT.nodes(data = True))
        for e1, e2, ed in PDebT.edges(data = True):
            if WBDebG.nodes[e1]['bipartite_set'] != WBDebG.nodes[e2]['bipartite_set']:
                WBDebG.add_edge(e1, e2, data = PDebT[e1][e2]['data'])
        return WBDebG
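
    # Note (hedged): the WBDebG keeps every PDebT comment but only the reply
    # links that cross the two bipartite sets, i.e. the attack relations
    # between the "in favor" (+1) and "not in favor" (-1) sides; within-side
    # (supporting) replies are dropped.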

    def WBDebG2xml(self, args):
        '''
        Saves self.WBDebG graph to xml file
        '''
        xml = ET.Element('entailment-corpus')
        xml.append(ET.Comment(args2str(args)))
        al_xml = ET.SubElement(xml, 'argument-list')
        i = 1 # 0 for root
        maxw = minw = scale_weight(int(self.WBDebG.nodes[self.root_id]['data'].get('score')), args)
        for n_id, nd in self.WBDebG.nodes(data = True):
            a = nd['data']
            w = scale_weight(int(a.get('score')), args)
            self.WBDebG.nodes[n_id]['weight'] = w
            if w < minw:
                minw = w
            elif w > maxw:
                maxw = w
            a.set('weight', str(w))
            a.set('bipartite_set', str(self.WBDebG.nodes[n_id]['bipartite_set']))
            a.set('comment_id', a.get('id'))
            if a.get('id') == self.root_id:
                # Id 0 for root node
                self.comment_id[a.get('id')] = '0'
                a.set('id', '0')
            else:
                self.comment_id[a.get('id')] = str(i)
                a.set('id', str(i))
                i = i + 1
            al_xml.append(a)
        al_xml.set('minweight', str(minw))
        al_xml.set('maxweight', str(maxw))
        self.min_weight = minw
        self.max_weight = maxw
        xml.set('num_nodes', str(i))
        ap_xml = ET.SubElement(xml, 'argument-pairs')
        i = 0
        for e1, e2, ed in self.WBDebG.edges(data = True):
            p = ed['data']
            p.set('entailment', 'ATTACKS')
            t = p.find('t')
            t.set('comment_id', t.get('id'))
            t.set('id', self.WBDebG.nodes[t.get('id')]['data'].get('id'))
            h = p.find('h')
            h.set('comment_id', h.get('id'))
            h.set('id', self.WBDebG.nodes[h.get('id')]['data'].get('id'))
            ap_xml.append(p)
            i = i + 1
        xml.set('num_edges', str(i))
        ET.ElementTree(xml).write("%s.wbg.xml" % args.input_file)

    def WBDebG_to_xml(self, args, WBDebG, tmp_file_name):
        '''
        Saves a WBDebG graph to an xml file, using the information generated in WBDebG2xml
        '''
        xml = ET.Element('entailment-corpus')
        xml.append(ET.Comment(args2str(args)))
        al_xml = ET.SubElement(xml, 'argument-list')
        maxw = minw = scale_weight(int(WBDebG.nodes[self.root_id]['data'].get('score')), args)
        for n_id, nd in WBDebG.nodes(data = True):
            a = nd['data']
            w = scale_weight(int(a.get('score')), args)
            WBDebG.nodes[n_id]['weight'] = w
            if w < minw:
                minw = w
            elif w > maxw:
                maxw = w
            al_xml.append(a)
        al_xml.set('minweight', str(minw))
        al_xml.set('maxweight', str(maxw))
        xml.set('num_nodes', str(WBDebG.number_of_nodes()))
        ap_xml = ET.SubElement(xml, 'argument-pairs')
        for e1, e2, ed in WBDebG.edges(data = True):
            p = ed['data']
            ap_xml.append(p)
        xml.set('num_edges', str(WBDebG.number_of_edges()))
        ET.ElementTree(xml).write('%s' % tmp_file_name)

    def draw_ccia18_PCT(self, args):
        '''
        Drawing Polarized Comment Tree
        '''
        print('Drawing Polarized Comment Tree...')
        gv = networkx.nx_agraph.to_agraph(self.PDebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        for n in gv.nodes():
            node_id = int(self.PDebT.nodes[n]['data'].get('id'))
            n.attr['label'] = str(node_id)
            if self.PDebT.nodes[n]['sentiment'] == 1:
                fillcolor = self.get_node_color([0x80, 0xFF, 0x40], self.WBDebG.nodes[n]['weight'])
            else:
                fillcolor = self.get_node_color([0x7F, 0x00, 0x7F], self.WBDebG.nodes[n]['weight'])
            n.attr['fillcolor'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, fillcolor)])
        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.pct.png" % args.input_file, format = 'png')

    def draw_ccia18_WBG(self, args):
        '''
        Drawing Weighted Bipartite Graph
        '''
        # TODO: Grouping nodes
        # https://stackoverflow.com/questions/19674316/grouping-nodes-with-the-same-color-near-each-other-in-graphviz
        if self.VAF_accepted:
            print('Drawing Weighted Bipartite Graph solution...')
            output_file_name = '%s.wbg-sol.png' % args.input_file
        else:
            print('Drawing Weighted Bipartite Graph...')
            output_file_name = '%s.wbg.png' % args.input_file
        gv = networkx.nx_agraph.to_agraph(self.WBDebG)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        #gv.edge_attr['color'] = '#FF8080'
        for n in gv.nodes():
            node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
            n.attr['label'] = str(node_id)
            bordercolor = [0x00, 0x00, 0x00]
            penwidth = 1
            if self.WBDebG.nodes[n]['bipartite_set'] == 1:
                fillcolor = self.get_node_color([0x00, 0x00, 0xFF], self.WBDebG.nodes[n]['weight'])
            else:
                fillcolor = self.get_node_color([0xFF, 0x00, 0x00], self.WBDebG.nodes[n]['weight'])
            if self.VAF_accepted:
                if node_id not in self.VAF_accepted:
                    bordercolor = fillcolor
                    penwidth = 3
                    fillcolor = self.get_node_color([0x00, 0x00, 0x00], self.WBDebG.nodes[n]['weight'])
            # fillcolor format '#RRGGBB', for example: fillcolor = '#FF8080'
            n.attr['fillcolor'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, fillcolor)])
            n.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, bordercolor)])
            n.attr['penwidth'] = penwidth
        #for e in gv.edges():
        #    e.attr['color'] = '#FF8080'
        gv.layout(prog = 'dot', args='-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw(output_file_name, format = 'png')

    def ccia2018_stats_to_file(self, args):
        '''
        Write WBDebG statistics and information to file
        '''
        print('Writing statistics to file...')
        num_nodes = self.WBDebG.number_of_nodes()
        out_str = 'CCIA 2018 stats\n---------------\nInput file = %s\n' % args.input_file
        out_str += 'Number of nodes = %i\n' % num_nodes
        out_str += 'Number of edges = %i\n' % self.WBDebG.number_of_edges()
        out_str += 'PDebT maximum depth = %i\n\n' % self.PDebT.nodes[self.root_id]['max_depth']
        yes_fav = not_fav = 0 # Number of comments in favor and not in favor
        yes_fav_sol = not_fav_sol = 0 # Number of comments in favor and not in favor in the solution
        yes_fav_solw = not_fav_solw = 0 # Weight of comments in favor and not in favor in the solution
        for n, nd in self.WBDebG.nodes(data = True):
            node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
            if nd['bipartite_set'] == 1:
                yes_fav += 1
                if node_id in self.VAF_accepted:
                    yes_fav_sol += 1
                    yes_fav_solw += nd['weight']
            else:
                not_fav += 1
                if node_id in self.VAF_accepted:
                    not_fav_sol += 1
                    not_fav_solw += nd['weight']
        out_str += '#Nodes in favor = %i (%.2f%% of total)\n' % (yes_fav, 100.0 * yes_fav / num_nodes)
        out_str += '#Nodes not in favor = %i (%.2f%% of total)\n\n' % (not_fav, 100.0 * not_fav / num_nodes)
        out_str += '#Nodes in favor in solution = %i (%.2f%% of in favor)\n' % (yes_fav_sol, 100.0 * yes_fav_sol / yes_fav)
        out_str += '#Nodes not in favor in solution = %i (%.2f%% of not in favor)\n' % (not_fav_sol, 100.0 * not_fav_sol / not_fav)
        out_str += 'Percentage difference = %0.2f%%\n\n' % abs(100.0 * yes_fav_sol / yes_fav - 100.0 * not_fav_sol / not_fav)
        out_str += 'Weight of solution, in favor - not in favor = %i - %i = %i\n' % (yes_fav_solw, not_fav_solw, yes_fav_solw - not_fav_solw)
        out_str += 'Weight of solution normalized [-1, 1] = %0.2f\n' % (float(yes_fav_solw - not_fav_solw) / float(yes_fav_solw + not_fav_solw))
        # More author based stats for CCIA 2019
        if args.ccia2019_order:
            authors = {}
            for n, nd in self.WBDebG.nodes(data = True):
                yes_fav_in_sol = 0
                not_fav_in_sol = 0
                a = self.WBDebG.nodes[n]['data'].get('author')
                if a not in authors:
                    authors[a] = {}
                    authors[a]['num_nodes'] = 0 # Number of nodes
                    authors[a]['num_nodes_yes_fav'] = 0 # Number of nodes in favor of root
                    authors[a]['num_nodes_not_fav'] = 0 # Number of nodes not in favor of root
                    authors[a]['num_nodes_in_sol'] = 0 # Number of nodes in solution
                    authors[a]['num_nodes_not_in_sol'] = 0 # Number of nodes not in solution
                    authors[a]['num_nodes_yes_fav_in_sol'] = 0 # Number of nodes in favor of root and in solution
                    authors[a]['num_nodes_not_fav_in_sol'] = 0 # Number of nodes not in favor of root and in solution
                    authors[a]['num_in_edges'] = 0 # Replies to this node
                    authors[a]['num_in_edges_pos'] = 0 # Replies to this node that support it
                    authors[a]['num_in_edges_neg'] = 0 # Replies to this node that do not support it
                    authors[a]['num_init_nodes'] = 0 # Number of nodes before alpha pruning
                    authors[a]['sum_sentiment_radicality'] = 0 # sum_{c \in Comments} abs(sentiment_rel(c))
                    authors[a]['scores'] = [] # List of scores of the author's comments
                authors[a]['num_nodes'] += 1
                authors[a]['scores'].append(int(self.WBDebG.nodes[n]['data'].get('score')))
                if self.WBDebG.nodes[n]['bipartite_set'] == 1:
                    authors[a]['num_nodes_yes_fav'] += 1
                    yes_fav_in_sol += 1
                else:
                    authors[a]['num_nodes_not_fav'] += 1
                    not_fav_in_sol += 1
                node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
                if node_id in self.VAF_accepted:
                    authors[a]['num_nodes_in_sol'] += 1
                    yes_fav_in_sol += 1
                    not_fav_in_sol += 1
                else:
                    authors[a]['num_nodes_not_in_sol'] += 1
                if yes_fav_in_sol == 2:
                    authors[a]['num_nodes_yes_fav_in_sol'] += 1
                elif not_fav_in_sol == 2:
                    authors[a]['num_nodes_not_fav_in_sol'] += 1
                authors[a]['num_in_edges'] += self.PDebT.in_degree(n)
                for e in self.PDebT.in_edges(n):
                    if self.PDebT.nodes[e[0]]['bipartite_set'] == self.PDebT.nodes[e[1]]['bipartite_set']:
                        authors[a]['num_in_edges_pos'] += 1
                    else:
                        authors[a]['num_in_edges_neg'] += 1
            for n, nd in self.DebT.nodes(data = True):
                a = self.DebT.nodes[n]['data'].get('author')
                if a in authors:
                    authors[a]['num_init_nodes'] += 1 # Counting nodes removed by alpha pruning
                    authors[a]['sum_sentiment_radicality'] += abs(self.DebT.nodes[n]['sentiment_not_normalized'])
            out_str += self.get_stats_ccia2019(authors, args)
        # Write to file
        output_file_name = '%s.wbg-sol.info' % args.input_file
        output_file = open(output_file_name, 'w')
        output_file.write(out_str)
        output_file.close()

    def get_stats_ccia2019(self, authors, args):
        '''
        Get statistics based on the CCIA 2019 paper
        '''
        for a in authors:
            # Radicality
            authors[a]['radicality'] = float(authors[a]['sum_sentiment_radicality']) / authors[a]['num_init_nodes']
            # Attention generator
            authors[a]['attention_generator_pos'] = float(authors[a]['num_in_edges_pos']) / authors[a]['num_nodes']
            authors[a]['attention_generator_neg'] = float(authors[a]['num_in_edges_neg']) / authors[a]['num_nodes']
            # Author polarization --> [-1, 1]
            if (authors[a]['num_nodes_yes_fav_in_sol'] + authors[a]['num_nodes_not_fav_in_sol']) > 0:
                authors[a]['pol_sol'] = float(authors[a]['num_nodes_yes_fav_in_sol'] - authors[a]['num_nodes_not_fav_in_sol']) / (authors[a]['num_nodes_yes_fav_in_sol'] + authors[a]['num_nodes_not_fav_in_sol'])
            else:
                authors[a]['pol_sol'] = None
            authors[a]['pol'] = float(authors[a]['num_nodes_yes_fav'] - authors[a]['num_nodes_not_fav']) / (authors[a]['num_nodes_yes_fav'] + authors[a]['num_nodes_not_fav'])
            # max(|score|) for all author comments
            authors[a]['max_abs_score'] = max(map(abs, authors[a]['scores']))
            # sum(|score|) for all author comments
            authors[a]['sum_abs_score'] = sum(map(abs, authors[a]['scores']))
            # sum(score) for all author comments
            authors[a]['sum_score'] = sum(authors[a]['scores'])
            # number of author comments
            authors[a]['num_comments'] = len(authors[a]['scores'])
        out_str = 'CCIA 2019 stats\n---------------\n'
        for ordering in args.ccia2019_order:
            # Sort by authors relevance choice (ccia2019_order parameter)
            sorted_authors = sorted(authors.items(), key = lambda a: a[1][ordering], reverse = True)
            out_str += 'Number of authors: %i\n' % len(authors)
            out_str += 'Sorted by: %s\n' % ordering
            # Output top X authors data
            data = ['author', 'max_abs_score', 'sum_abs_score', 'sum_score', 'num_comments', 'radicality', 'att_gen_pos', 'att_gen_neg', 'polarization']
            out_str += format_data(data)
            for a in sorted_authors[:20]:
                data = [a[0], a[1]['max_abs_score'], a[1]['sum_abs_score'], a[1]['sum_score'], a[1]['num_comments'], a[1]['radicality'], a[1]['attention_generator_pos'], a[1]['attention_generator_neg'], a[1]['pol']]
                out_str += format_data(data)
        return out_str

    def prca2019_authors_relevance(self, args):
        '''
        Compute relevance(u) = sum_{c in Gamma | user(c) = u} W(score(c))
        '''
        print(' Computing authors relevance...')
        authors = {}
        for n, nd in self.PDebT.nodes(data = True):
            a = self.PDebT.nodes[n]['data'].get('author')
            if a not in authors:
                authors[a] = {}
                authors[a]['wscores'] = [] # List of scores of the author's comments
            authors[a]['wscores'].append(scale_weight(int(self.PDebT.nodes[n]['data'].get('score')), args))
        for a in authors:
            authors[a]['sum_wscore'] = sum(authors[a]['wscores'])
        return sorted(authors.items(), key = lambda a: a[1]['sum_wscore'], reverse = True)
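
    # Worked example (hedged): with log_base = 10, three comments by the same
    # author with scores 5, 120 and 3000 get scaled weights W = 1, 3 and 4
    # (see scale_weight below), so relevance(u) = 1 + 3 + 4 = 8; the method
    # returns the authors sorted by this sum in decreasing order.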

    def prca2019_remove_author(self, author, G):
        '''
        Return a copy of WBDebG with author removed (RDebT, Restricted DebT)
        '''
        # Copy graph
        res_G = networkx.DiGraph()
        res_G.add_nodes_from(G.nodes(data = True))
        for e1, e2, ed in G.edges(data = True):
            res_G.add_edge(e1, e2, data = G[e1][e2]['data'])
        # Remove author (iterate over a snapshot, since nodes are removed inside the loop)
        rg = res_G.reverse() # Reverse graph (direction of edges reversed)
        for n, nd in list(res_G.nodes(data = True)):
            if nd['data'].get('author') == author:
                if n in res_G and nd['data'].get('comment_id') != self.root_id: # Not already removed (appears in a previous subtree) and not root node
                    # Get subtree of node n and remove it
                    st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, n)
                    res_G.remove_nodes_from(st.nodes())
        return res_G

    def prca2019_get_stats(self, G):
        '''
        Get the stats needed of the authors' list from G
        '''
        G_stats = {}
        G_stats['num_comments'] = G.number_of_nodes() # Number of comments
        G_stats['list_comments_id'] = list(G.nodes()) # List of comments id (Reddit id)
        G_stats['list_nodes_id'] = [int(self.comment_id[x]) for x in G_stats['list_comments_id']] # List of nodes id [0, ...] (0 = root node); a list rather than a map iterator, so it can be consumed more than once
        G_stats['list_nodes_id_Cplus'] = [int(self.comment_id[n]) for n, nd in G.nodes(data = True) if nd['bipartite_set'] == 1] # List of nodes id in favor of root node
        G_stats['list_nodes_id_Cminus'] = [int(self.comment_id[n]) for n, nd in G.nodes(data = True) if nd['bipartite_set'] == -1] # List of nodes id NOT in favor of root node
        return G_stats

    def prca2019_analysis(self, args):
        '''
        Perform PRL VSI PR&CA 2019 analysis
        '''
        print('Performing VSI PR&CA 2019 analysis...')
        tmp_file_name = '/tmp/tmp-reddit-at-WBDebG.tmp.xml'
        sorted_authors = self.prca2019_authors_relevance(args) # Most relevant authors
        PDebT_stats = self.prca2019_get_stats(self.PDebT)
        res_PDebT_stats = {}
        res_VAF_accepted = {}
        ai = 0
        for a, ad in sorted_authors[:args.prca2019]:
            print(' Analysing author "%s" (%i/%i)...' % (a, ai + 1, min(args.prca2019, len(sorted_authors))))
            res_PDebT = self.prca2019_remove_author(a, self.PDebT)
            res_PDebT_stats[a] = self.prca2019_get_stats(res_PDebT)
            res_WBDebG = self.PDebG_to_WBDebG(res_PDebT)
            self.WBDebG_to_xml(args, res_WBDebG, tmp_file_name)
            res_VAF_accepted[a] = VAF_solver(args, tmp_file_name)
            ai = ai + 1
        self.prca2019_stats_to_file(sorted_authors, PDebT_stats, res_PDebT_stats, res_VAF_accepted, args)

    def prca2019_stats_to_file(self, sorted_authors, PDebT_stats, res_PDebT_stats, res_VAF_accepted, args):
        '''
        Compute PRL VSI PR&CA 2019 stats and output them to file
        '''
        output_file_name = '%s.prca2019.info' % args.input_file
        print('Writing statistics to file "%s"...' % output_file_name)
        out_str = 'Input file: %s\n' % args.input_file
        out_str += 'Number of authors: %i\n' % len(sorted_authors)
        out_str += 'Number of comments: %i\n' % PDebT_stats['num_comments']
        data = ['author', '#comments', 'relevance', 'engaging', 'influence', 'rebalancing', 'rebalancing2']
        out_str += format_data(data)
        # Data from initial graphs and solution with all users
        Ca = frozenset(PDebT_stats['list_nodes_id'])
        Cplus = frozenset(PDebT_stats['list_nodes_id_Cplus'])
        Cminus = frozenset(PDebT_stats['list_nodes_id_Cminus'])
        S = frozenset(self.VAF_accepted)
        polS = (len(S & Cplus) - len(S & Cminus)) / float(len(S))
        lengaging = []
        linfluence = []
        lrebalancing = []
        lrebalancing2 = []
        for a, ad in sorted_authors[:args.prca2019]:
            # Data of the restricted (without user) graphs and solution
            Cau = frozenset(res_PDebT_stats[a]['list_nodes_id'])
            Cplusu = frozenset(res_PDebT_stats[a]['list_nodes_id_Cplus'])
            Cminusu = frozenset(res_PDebT_stats[a]['list_nodes_id_Cminus'])
            Su = frozenset(res_VAF_accepted[a])
            polSu = (len(Su & Cplusu) - len(Su & Cminusu)) / float(len(Su))
            polSCau = (len(S & Cplus & Cau) - len(S & Cminus & Cau)) / float(len(S & Cau))
            # engaging(u) = #(Ca \ Cau) / #Ca, fraction of the conversation lost when removing the user
            engaging = len(Ca - Cau) / float(len(Ca))
            lengaging.append(engaging)
            # influence(u) = (#((Cau \ Su) \cap S) + #(Su \cap (Ca \ S))) / #Cau, (comments moved out of S + comments moved into S) / # remaining comments
            influence = (len((Cau - Su) & S) + len(Su & (Ca - S))) / float(len(Cau))
            linfluence.append(influence)
            # rebalancing(u) = |polarization(S) - polarization(Su)|, polarization(S) = (#(S \cap C+) - #(S \cap C-)) / #S, absolute change in polarization after removing user
            rebalancing = abs(polS - polSu)
            lrebalancing.append(rebalancing)
            # rebalancing2(u) = |polarization(S \cap Cau) - polarization(Su)|, same change but restricted to the comments surviving the removal
            rebalancing2 = abs(polSCau - polSu)
            lrebalancing2.append(rebalancing2)
            # Add row to output string
            data = [a, len(ad['wscores']), ad['sum_wscore'], engaging, influence, rebalancing, rebalancing2]
            out_str += format_data(data)
        data = ['Mean', '', '', sum(lengaging) / len(lengaging), sum(linfluence) / len(linfluence), sum(lrebalancing) / len(lrebalancing), sum(lrebalancing2) / len(lrebalancing2)]
        out_str += format_data(data)
        # Write to file
        output_file = open(output_file_name, 'w')
        output_file.write(out_str)
        output_file.close()
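
    # Worked example (hedged, toy sets): with Ca = {0,...,9} and Cau = {0,...,7}
    # (removing u drops comments 8 and 9), engaging(u) = #(Ca \ Cau) / #Ca =
    # 2/10 = 0.2. If S = {0, 1, 2} and Su = {0, 1, 3}, comment 3 moved into the
    # solution and comment 2 moved out of it, so influence(u) =
    # (#((Cau \ Su) & S) + #(Su & (Ca \ S))) / #Cau = (1 + 1) / 8 = 0.25.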

# Functions
def args2str(args):
    argstr = '=== Begin arguments ===\n'
    argstr += 'Input file: %s\n' % args.input_file
    argstr += 'Alpha parameter: %f\n' % args.alpha
    argstr += 'Algorithm: %s\n' % args.algorithm
    argstr += 'Log scale base: %i\n' % args.log_base
    argstr += 'Socialarg git path: %s\n' % args.socialarg_path
    argstr += 'Spark path: %s\n' % args.spark_path
    argstr += 'Draw graphs: %s\n' % args.draw_graphs
    argstr += 'Neutral comments: %s\n' % args.neutral_comments
    argstr += 'CCIA2019 author ordering: %s\n' % args.ccia2019_order
    argstr += 'VSI PR&CA 2019 analysis: %i\n' % args.prca2019
    argstr += 'User-based analysis: %s\n' % args.user
    argstr += 'User valuation: %s\n' % args.user_valuation
    argstr += 'SCIP output: %s\n' % args.scip_output
    argstr += 'Parameters: %s\n' % args.params
    argstr += 'Random seed: %s\n' % args.seed
    argstr += '=== End arguments ==='
    return argstr

def scale_weight(weight, args):
    '''Scales the weight using a log function'''
    if weight >= 1:
        return int(math.floor(math.log(weight, args.log_base)) + 1)
    else:
        return 0
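
# Worked example (hedged): with the default log_base = 10, scores map to
# weights as 1-9 -> 1, 10-99 -> 2, 100-999 -> 3, and any score below 1
# (including negative scores) collapses to 0. For instance,
# scale_weight(950, args) = floor(log10(950)) + 1 = 2 + 1 = 3.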

def VAF_solver(args, input_file_name = None):
    '''
    Solves the discussion using the VAF solver and returns the accepted nodes
    '''
    if input_file_name:
        print(' Solving graph with VAF solver...')
        input_file_name = input_file_name[:-4]
    else:
        print('Solving graph with VAF solver...')
        input_file_name = '%s.wbg' % args.input_file
    output_file_name = '/tmp/tmp-reddit-at.out'
    # Check files
    if os.path.isdir(os.path.expanduser(args.spark_path)):
        args.spark_path = os.path.abspath(os.path.expanduser(args.spark_path))
        spark_submit = '%s/bin/spark-submit' % args.spark_path
        if not os.path.exists(spark_submit):
            sys.exit('ERROR: spark-submit not found at "%s".' % spark_submit)
    else:
        sys.exit('ERROR: Spark folder not found "%s".' % args.spark_path)
    if os.path.isdir(os.path.expanduser(args.socialarg_path)):
        args.socialarg_path = os.path.abspath(os.path.expanduser(args.socialarg_path))
        #analyzer_jar = '%s/distributed_social_network_analyzer/target/scala-2.11/social-network-analyzer_2.11-1.0.jar' % args.socialarg_path
        analyzer_jar = '%s/distributed_social_network_analyzer/target/scala-2.12/social-network-analyzer_2.12-1.0.jar' % args.socialarg_path
        if not os.path.exists(analyzer_jar):
            sys.exit('ERROR: analyzer jar file not found at "%s".' % analyzer_jar)
    else:
        sys.exit('ERROR: socialarg git repo folder not found "%s".' % args.socialarg_path)
    # Run solver
    cmd = '(time %s --master local[4] --class "MainAppFromXML" %s %s) &> %s' % (spark_submit, analyzer_jar, input_file_name, output_file_name)
    os.system(cmd)
    # Parse output
    accepted = []
    try:
        xml = ET.parse('%s-xml.sol' % input_file_name)
        answer = xml.find('answer')
        for a in answer.iter('arg'):
            accepted.append(int(a.attrib['id']))
    except Exception:
        sys.exit('ERROR: something happened while parsing solver output "%s-xml.sol".' % input_file_name)
    return accepted
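
# Expected solver output (hedged sketch, inferred only from the parsing code
# above): a file named <input_file_name>-xml.sol whose <answer> element lists
# the accepted arguments as <arg id="..."/> entries, for example:
#   <solution><answer><arg id="0"/><arg id="4"/></answer></solution>
# Only the integer id attributes are collected into the returned list.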

def format_data(data):
    '''
    Format data list for an output of fixed column width
    '''
    width = [20, 9, 9, 9, 9, 11, 12] + [12] * len(data)
    sep = '|'
    eol = '\n'
    out_str = ''
    while data:
        w = width.pop(0)
        d = data.pop(0)
        if isinstance(d, float) and d != 0:
            data_str = '{:0.10f}'.format(d)
        else:
            data_str = str(d)
        out_str += data_str[:w].ljust(w)
        if data:
            out_str += sep
    out_str += eol
    return out_str
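
# Example (hedged): format_data(['author', 12, 3.5]) returns the single row
# 'author              |12       |3.5000000' followed by a newline: each cell
# is truncated/padded to its column width (20, 9, 9, ...) and cells are
# separated by '|'. Note that the call consumes (pops) the given list.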

# Main
if __name__ == '__main__':
    # Parse arguments
    parser = argparse.ArgumentParser(description = 'Reddit Analysis Tool.')
    # Optional arguments
    parser.add_argument('-a', '--alpha', default = 0.5, type = float, help = 'Alpha parameter used as threshold for several functions (default: 0.5)', dest = 'alpha')
    parser.add_argument('-al', '--algorithm', type = str, default = 'g0', help = 'Algorithm and parameters in case available, see docstring for more information (default: g0)', dest = 'algorithm')
    parser.add_argument('--ccia2019_order', nargs = '+', type = str, choices = ['max_abs_score', 'sum_abs_score', 'sum_score', 'num_comments'], help = 'Author ordering for CCIA 2019 stats (default: max_abs_score)', dest = 'ccia2019_order')
    parser.add_argument('-d', '--draw_graphs', action = 'store_true', default = False, help = 'Draws the graphs of all the steps of the analysis (default: False)', dest = 'draw_graphs')
    parser.add_argument('-if', '--input_file', default = None, type = str, help = 'Input file name of the xml with the Reddit post information', dest = 'input_file')
    parser.add_argument('-lb', '--log_base', default = 10, type = int, help = 'Logarithmic scale base for weighting (default: 10)', dest = 'log_base')
    parser.add_argument('-nc', '--neutral_comments', nargs = '?', type = str, default = 'do_nothing', choices = ['do_nothing', 'remove_subtree', 'to_positive'], help = 'Neutral comments treatment (default: do_nothing)', dest = 'neutral_comments')
    parser.add_argument('-p', '--params', default = None, type = str, help = 'Argument used to specify parameters for some functionalities', dest = 'params')
    parser.add_argument('--prca2019', default = 0, type = int, help = 'PRL VSI in PR&CA 2019 analysis (default: 0)', dest = 'prca2019')
    parser.add_argument('-s', '--seed', default = None, type = str, help = 'Seed to initialize random numbers (default: None)', dest = 'seed')
    parser.add_argument('-so', '--scip_output', action = 'store_true', default = False, help = 'Outputs UDebG files in SCIP format to solve bipartition problem (default: False)', dest = 'scip_output')
    parser.add_argument('--socialarg_path', default = '~/git/socialarg', type = str, help = 'Path to the socialarg git repo (default: ~/git/socialarg)', dest = 'socialarg_path')
    parser.add_argument('-sp', '--spark_path', default = '~/.local/spark-2.2.1-bin-hadoop2.7', type = str, help = 'Spark path (default: ~/.local/spark-2.2.1-bin-hadoop2.7)', dest = 'spark_path')
    parser.add_argument('-u', '--user', default = None, type = str, choices = ['mdai2020', 'wia2021'], help = 'User-based analysis (default: None)', dest = 'user')
    parser.add_argument('-uv', '--user_valuation', default = None, type = str, choices = ['comment_karma', 'sum_scores'], help = 'User valuation for a VAF over UDebG (default: None)', dest = 'user_valuation')
    args = parser.parse_args()
    print(args2str(args))
    # No input file defined
    if not args.input_file:
        # Generate random UDebG
        udebg.UDebG(None, None, args)
        exit()
    # Read debate and create initial Graph
    rg = RedditG(args.input_file)
    # User-oriented analysis
    if args.user:
        if args.user == 'wia2021':
            rg.wia2021_DebT(args)
            rg.wia2021_SDebT(args)
            udebg.UDebG(rg.SDebT, rg.root_id, args)
        exit()
    # Perform analysis (WBDebG)
    rg.ccia18_analysis(args)
    # Output results
    rg.WBDebG2xml(args)
    if args.draw_graphs:
        rg.draw_ccia18_PCT(args)
        rg.draw_ccia18_WBG(args)
    # Compute solution using VAF solver
    rg.VAF_accepted = VAF_solver(args)
    # Output results with solution
    if args.draw_graphs:
        rg.draw_ccia18_WBG(args)
    # Compute stats
    if args.prca2019:
        rg.prca2019_analysis(args)
    else:
        rg.ccia2018_stats_to_file(args)
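
# Example invocations (hedged; the script and input file names are
# hypothetical):
#   ./reddit-analysis.py -if post.xml -d           # CCIA 2018 pipeline with drawings
#   ./reddit-analysis.py -if post.xml --prca2019 5 # also analyse the 5 most relevant authors
#   ./reddit-analysis.py -if post.xml -u wia2021   # user-oriented wia2021 analysis (UDebG)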
