#!/usr/bin/python3 -OO
'''
Artificial Intelligence Research Group
University of Lleida
'''

# Libraries
import os
import sys
import argparse
import xml.etree.ElementTree as ET
import math
import networkx
import ast

import udebg

# Classes
class RedditG():
    '''
    Reddit post in graph structure for its analysis
    '''
    def __init__(self, input_file):
        self.xml = None
        self.root_id = None
        self.DebT = networkx.DiGraph()   # Debate Tree
        self.PDebT = networkx.DiGraph()  # Pruned Debate Tree
        self.SDebT = networkx.DiGraph()  # Two-Sided Debate Tree
        self.WBDebG = None               # Weighted Bipartite Debate Graph
        self.comment_id = {}
        self.min_weight = None
        self.max_weight = None
        self.VAF_accepted = None
        self.read_xml(input_file)

    def remove_deleted_comments(self):
        rg = self.DebT.reverse()  # Reverse graph (direction of edges reversed)
        while True:  # To avoid "RuntimeError: dictionary changed size during iteration" in inner loop
            changed = False
            for n, nd in self.DebT.nodes(data = True):
                if nd['data'].get('author') == 'None':
                    if n in self.DebT:  # Not already removed (appears in a previous subtree)
                        # Get subtree of node n and remove it
                        st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, n)
                        self.DebT.remove_nodes_from(st.nodes())
                        changed = True
                        break
            if not changed:
                break

    def read_xml(self, input_file):
        '''
        Read XML file with the conversation
        '''
        print('Reading xml input file...')
        self.xml = ET.parse(input_file)
        al = self.xml.find('argument-list')
        for arg in al.iter('arg'):
            if 'title' in arg.attrib:
                self.root_id = arg.attrib['id']
                if arg.attrib['author'] == 'None':  # To not delete the full tree when the root node author is deleted
                    arg.attrib['author'] = 'root_node_author'
            self.DebT.add_node(arg.attrib['id'], data = arg)
        ap = self.xml.find('argument-pairs')
        for pair in ap.iter('pair'):  # Argument pair (relation): t replies to h
            self.DebT.add_edge(pair.find('t').get('id'), pair.find('h').get('id'), data = pair)
        self.remove_deleted_comments()
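    # Illustrative sketch (not part of the pipeline) of the subtree removal used
    # by remove_deleted_comments() above. Edges point from reply to parent, so
    # dfs_tree() on the reversed graph yields a node and all its descendants.
    # Hypothetical 4-comment debate: c1 and c2 reply to root, c3 replies to c1.
    #
    #   g = networkx.DiGraph()
    #   g.add_edges_from([('c1', 'root'), ('c2', 'root'), ('c3', 'c1')])
    #   rg = g.reverse()
    #   st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, 'c1')
    #   g.remove_nodes_from(st.nodes())  # removes 'c1' and its reply 'c3'
    #   sorted(g.nodes())                # --> ['c2', 'root']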
    def wia2021_DebT(self, args):
        '''
        DebT for wia2021
        '''
        print('Generating DebT for wia2021...')
        # Set chronological id to comments in DebT
        id_list = sorted([n for n, nd in self.DebT.nodes(data = True) if 'title' not in nd['data'].attrib])
        for i, c_id in enumerate(id_list):
            self.DebT.nodes[c_id]['chrono_id'] = i + 1  # chrono_id for root node set below
        print('  Number of nodes DebT = {}'.format(self.DebT.number_of_nodes()))
        # Initializations
        self.DebT.nodes[self.root_id]['sentiment_not_normalized'] = 0
        self.DebT.nodes[self.root_id]['chrono_id'] = 0
        # BFS on DebT to compute sentiment not normalized [-2, 2]
        list_edges = [e for e in self.DebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop(0)
            node_id = current_edge[0]
            sentiment = self.ccia18_sentiment(False, ast.literal_eval(self.DebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            self.DebT.nodes[node_id]['sentiment_not_normalized'] = sentiment
            list_edges.extend([e for e in self.DebT.in_edges(node_id)])
        if args.draw_graphs:
            self.wia2021_draw_DebT(args)

    def wia2021_SDebT(self, args):
        '''
        SDebT for wia2021
        '''
        print('Generating SDebT for wia2021...')
        # Copy DebT to SDebT
        self.SDebT.add_nodes_from(self.DebT.nodes(data = True))
        for e1, e2, ed in self.DebT.edges(data = True):
            self.SDebT.add_edge(e1, e2, data = self.DebT[e1][e2]['data'])
        # Initializations
        self.SDebT.nodes[self.root_id]['side'] = 1
        # BFS to compute the side of each node
        list_edges = [e for e in self.SDebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop(0)
            node_id = current_edge[0]
            parent_node_id = current_edge[1]
            if (self.SDebT.nodes[parent_node_id]['side'] == 1 and self.DebT.nodes[node_id]['sentiment_not_normalized'] > 0) or \
               (self.SDebT.nodes[parent_node_id]['side'] == -1 and self.DebT.nodes[node_id]['sentiment_not_normalized'] <= 0):
                self.SDebT.nodes[node_id]['side'] = 1
            else:
                self.SDebT.nodes[node_id]['side'] = -1
            list_edges.extend([e for e in self.SDebT.in_edges(node_id)])
        if args.draw_graphs:
            self.wia2021_draw_SDebT(args)

    def wia2021_draw_DebT(self, args):
        '''
        Drawing wia2021 DebT
        '''
        print('Drawing wia2021 DebT...')
        gv = networkx.nx_agraph.to_agraph(self.DebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        gv.node_attr['fillcolor'] = '#0000FF'
        gv.node_attr['fontcolor'] = '#FFFFFF'
        for n in gv.nodes():
            n.attr['label'] = str(self.DebT.nodes[n]['chrono_id'])
        gv.edge_attr['color'] = '#000000'
        for e in gv.edges():
            s = self.DebT.nodes[e[0]]['sentiment_not_normalized']
            if s > 0:
                contrast, color = udebg.get_weighted_color([0x00, 0xFF, 0x00], 0, 2, s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
            elif s < 0:
                contrast, color = udebg.get_weighted_color([0xFF, 0x00, 0x00], 0, 2, -s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
        gv.layout(prog = 'dot', args = '-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.debt.png" % args.input_file, format = 'png')

    def wia2021_draw_SDebT(self, args):
        '''
        Drawing wia2021 SDebT
        '''
        print('Drawing wia2021 SDebT...')
        gv = networkx.nx_agraph.to_agraph(self.SDebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        gv.node_attr['fillcolor'] = '#0000FF'
        gv.node_attr['fontcolor'] = '#FFFFFF'
        for n in gv.nodes():
            n.attr['label'] = str(self.SDebT.nodes[n]['chrono_id'])
            side = self.SDebT.nodes[n]['side']
            if side == 1:
                n.attr['fontcolor'] = '#000000'
                n.attr['fillcolor'] = '#4FCFFF'  # light green = '#6FFF6F', cyan = '#4FCFFF'
            else:
                n.attr['fontcolor'] = '#FFFFFF'
                n.attr['fillcolor'] = '#00007F'  # light red = '#FF6F6F', dark blue = '#00007F'
        gv.edge_attr['color'] = '#000000'
        for e in gv.edges():
            s = self.SDebT.nodes[e[0]]['sentiment_not_normalized']
            if s > 0:
                contrast, color = udebg.get_weighted_color([0x00, 0xFF, 0x00], 0, 2, s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
            elif s < 0:
                contrast, color = udebg.get_weighted_color([0xFF, 0x00, 0x00], 0, 2, -s)
                e.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])
        gv.layout(prog = 'dot', args = '-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.sdebt.png" % args.input_file, format = 'png')

    def ccia18_sentiment(self, normalized, sentiment_distribution, args):
        '''
        Computes the sentiment from a sentiment distribution of 5 values
        [very neg, neg, neutral, pos, very pos]:
        Normalized:     --> [neg = -1, neutral = 0, pos = 1]
        Not normalized: --> [-2, 2]
        '''
        sentiment_relevance = [-2, -1, 0, 1, 2]
        res = [a * b for a, b in zip(sentiment_relevance, sentiment_distribution)]
        res = sum(res)
        if not normalized:
            return res
        if abs(res) > args.alpha:
            if res > 0:
                return 1  # Positive
            else:
                return -1  # Negative
        else:
            return 0  # Neutral
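    # Illustrative check of ccia18_sentiment() above, with a made-up distribution
    # [very neg, neg, neutral, pos, very pos] and the default alpha = 0.5:
    #   distribution = [0.0, 0.1, 0.2, 0.4, 0.3]
    #   not normalized: -2*0.0 - 1*0.1 + 0*0.2 + 1*0.4 + 2*0.3 = 0.9
    #   normalized: |0.9| > 0.5 --> 1 (positive)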
    def get_node_color(self, base_color, w):
        hw = 0xCF
        if w >= self.max_weight:
            hw = 0
        elif self.max_weight > self.min_weight:
            hw = int(hw * (float(self.max_weight - w) / float(self.max_weight - self.min_weight)))
        color = [a | b for a, b in zip(base_color, [hw, hw, hw])]
        return color

    def ccia18_analysis(self, args):
        '''
        Weighted Bipartite Graph analysis
        '''
        print('Generating PDebT and WBDebG...')
        # Copy DebT to PDebT
        self.PDebT.add_nodes_from(self.DebT.nodes(data = True))
        for e1, e2, ed in self.DebT.edges(data = True):
            self.PDebT.add_edge(e1, e2, data = self.DebT[e1][e2]['data'])
        # Initializations
        self.PDebT.nodes[self.root_id]['bipartite_set'] = 1  # 1 in favor of root, -1 not in favor
        self.PDebT.nodes[self.root_id]['sentiment'] = 1
        self.DebT.nodes[self.root_id]['sentiment_not_normalized'] = 0
        rg = self.PDebT.reverse()  # Reverse graph (direction of edges reversed)
        # DFS on PDebT before removing nodes, to save the sentiment not normalized in the DebT
        list_edges = [e for e in self.PDebT.in_edges(self.root_id)]
        self.PDebT.nodes[self.root_id]['depth'] = 1
        max_depth = 1
        while list_edges:
            current_edge = list_edges.pop()
            node_id = current_edge[0]
            self.PDebT.nodes[node_id]['depth'] = self.PDebT.nodes[current_edge[1]]['depth'] + 1
            if self.PDebT.nodes[node_id]['depth'] > max_depth:
                max_depth = self.PDebT.nodes[node_id]['depth']
            sentiment = self.ccia18_sentiment(False, ast.literal_eval(self.PDebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            self.DebT.nodes[node_id]['sentiment_not_normalized'] = sentiment
            list_edges.extend([e for e in self.PDebT.in_edges(node_id)])
        self.PDebT.nodes[self.root_id]['max_depth'] = max_depth
        # DFS and prune PDebT
        list_edges = [e for e in self.PDebT.in_edges(self.root_id)]
        while list_edges:
            current_edge = list_edges.pop()
            node_id = current_edge[0]
            father_id = current_edge[1]
            sentiment = self.ccia18_sentiment(True, ast.literal_eval(self.PDebT.nodes[node_id]['data'].get('sentiment_distribution')), args)
            if sentiment == 1:  # Positive
                self.PDebT.nodes[node_id]['bipartite_set'] = self.PDebT.nodes[father_id]['bipartite_set']
            elif sentiment == -1:  # Negative
                self.PDebT.nodes[node_id]['bipartite_set'] = -self.PDebT.nodes[father_id]['bipartite_set']
            if sentiment == 0:  # Neutral: remove subtree
                st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, node_id)
                self.PDebT.remove_nodes_from(st.nodes())
            else:  # Not Neutral
                self.PDebT.nodes[node_id]['sentiment'] = sentiment
                list_edges.extend([e for e in self.PDebT.in_edges(node_id)])
        # Create the WBDebG
        self.WBDebG = self.PDebG_to_WBDebG(self.PDebT)

    def PDebG_to_WBDebG(self, PDebT):
        '''
        Create the WBDebG from the PDebT
        '''
        WBDebG = networkx.DiGraph()
        WBDebG.add_nodes_from(PDebT.nodes(data = True))
        for e1, e2, ed in PDebT.edges(data = True):
            if WBDebG.nodes[e1]['bipartite_set'] != WBDebG.nodes[e2]['bipartite_set']:
                WBDebG.add_edge(e1, e2, data = PDebT[e1][e2]['data'])
        return WBDebG
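    # Illustrative check of PDebG_to_WBDebG() above (hypothetical nodes): an edge
    # of the PDebT is kept in the WBDebG only when it crosses the bipartition,
    # i.e. it connects a comment in favor of the root with one not in favor:
    #   bipartite_set(c1) = 1, bipartite_set(c2) = -1 --> edge (c2, c1) kept (attack)
    #   bipartite_set(c1) = 1, bipartite_set(c3) = 1  --> edge (c3, c1) dropped (support)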
    def WBDebG2xml(self, args):
        '''
        Saves self.WBDebG graph to xml file
        '''
        xml = ET.Element('entailment-corpus')
        xml.append(ET.Comment(args2str(args)))
        al_xml = ET.SubElement(xml, 'argument-list')
        i = 1  # 0 for root
        maxw = minw = scale_weight(int(self.WBDebG.nodes[self.root_id]['data'].get('score')), args)
        for n_id, nd in self.WBDebG.nodes(data = True):
            a = nd['data']
            w = scale_weight(int(a.get('score')), args)
            self.WBDebG.nodes[n_id]['weight'] = w
            if w < minw:
                minw = w
            elif w > maxw:
                maxw = w
            a.set('weight', str(w))
            a.set('bipartite_set', str(self.WBDebG.nodes[n_id]['bipartite_set']))
            a.set('comment_id', a.get('id'))
            if a.get('id') == self.root_id:  # Id 0 for root node
                self.comment_id[a.get('id')] = '0'
                a.set('id', '0')
            else:
                self.comment_id[a.get('id')] = str(i)
                a.set('id', str(i))
                i = i + 1
            al_xml.append(a)
        al_xml.set('minweight', str(minw))
        al_xml.set('maxweight', str(maxw))
        self.min_weight = minw
        self.max_weight = maxw
        xml.set('num_nodes', str(i))
        ap_xml = ET.SubElement(xml, 'argument-pairs')
        i = 0
        for e1, e2, ed in self.WBDebG.edges(data = True):
            p = ed['data']
            p.set('entailment', 'ATTACKS')
            t = p.find('t')
            t.set('comment_id', t.get('id'))
            t.set('id', self.WBDebG.nodes[t.get('id')]['data'].get('id'))
            h = p.find('h')
            h.set('comment_id', h.get('id'))
            h.set('id', self.WBDebG.nodes[h.get('id')]['data'].get('id'))
            ap_xml.append(p)
            i = i + 1
        xml.set('num_edges', str(i))
        ET.ElementTree(xml).write("%s.wbg.xml" % args.input_file)

    def WBDebG_to_xml(self, args, WBDebG, tmp_file_name):
        '''
        Saves a WBDebG graph to an xml file; it uses the information generated in WBDebG2xml
        '''
        xml = ET.Element('entailment-corpus')
        xml.append(ET.Comment(args2str(args)))
        al_xml = ET.SubElement(xml, 'argument-list')
        maxw = minw = scale_weight(int(WBDebG.nodes[self.root_id]['data'].get('score')), args)
        for n_id, nd in WBDebG.nodes(data = True):
            a = nd['data']
            w = scale_weight(int(a.get('score')), args)
            WBDebG.nodes[n_id]['weight'] = w
            if w < minw:
                minw = w
            elif w > maxw:
                maxw = w
            al_xml.append(a)
        al_xml.set('minweight', str(minw))
        al_xml.set('maxweight', str(maxw))
        xml.set('num_nodes', str(WBDebG.number_of_nodes()))
        ap_xml = ET.SubElement(xml, 'argument-pairs')
        for e1, e2, ed in WBDebG.edges(data = True):
            p = ed['data']
            ap_xml.append(p)
        xml.set('num_edges', str(WBDebG.number_of_edges()))
        ET.ElementTree(xml).write('%s' % tmp_file_name)

    def draw_ccia18_PCT(self, args):
        '''
        Drawing Polarized Comment Tree
        '''
        print('Drawing Polarized Comment Tree...')
        gv = networkx.nx_agraph.to_agraph(self.PDebT)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        for n in gv.nodes():
            node_id = int(self.PDebT.nodes[n]['data'].get('id'))
            n.attr['label'] = str(node_id)
            if self.PDebT.nodes[n]['sentiment'] == 1:
                fillcolor = self.get_node_color([0x80, 0xFF, 0x40], self.WBDebG.nodes[n]['weight'])
            else:
                fillcolor = self.get_node_color([0x7F, 0x00, 0x7F], self.WBDebG.nodes[n]['weight'])
            n.attr['fillcolor'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, fillcolor)])
        gv.layout(prog = 'dot', args = '-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw("%s.pct.png" % args.input_file, format = 'png')
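    # Illustrative check of the hex color formatting idiom used when drawing
    # (hypothetical values): get_node_color() returns an [R, G, B] list of ints,
    # which is rendered as a graphviz '#RRGGBB' string:
    #   color = [0x80, 0xFF, 0x40]
    #   '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, color)])  # --> '#80ff40'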
    def draw_ccia18_WBG(self, args):
        '''
        Drawing Weighted Bipartite Graph
        '''
        # TODO: Grouping nodes
        # https://stackoverflow.com/questions/19674316/grouping-nodes-with-the-same-color-near-each-other-in-graphviz
        if self.VAF_accepted:
            print('Drawing Weighted Bipartite Graph solution...')
            output_file_name = '%s.wbg-sol.png' % args.input_file
        else:
            print('Drawing Weighted Bipartite Graph...')
            output_file_name = '%s.wbg.png' % args.input_file
        gv = networkx.nx_agraph.to_agraph(self.WBDebG)
        gv.node_attr['style'] = 'filled'
        gv.node_attr['fixedsize'] = 'true'
        gv.node_attr['width'] = '0.4'
        gv.node_attr['height'] = '0.4'
        #gv.edge_attr['color'] = '#FF8080'
        for n in gv.nodes():
            node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
            n.attr['label'] = str(node_id)
            bordercolor = [0x00, 0x00, 0x00]
            penwidth = 1
            if self.WBDebG.nodes[n]['bipartite_set'] == 1:
                fillcolor = self.get_node_color([0x00, 0x00, 0xFF], self.WBDebG.nodes[n]['weight'])
            else:
                fillcolor = self.get_node_color([0xFF, 0x00, 0x00], self.WBDebG.nodes[n]['weight'])
            if self.VAF_accepted:
                if node_id not in self.VAF_accepted:
                    bordercolor = fillcolor
                    penwidth = 3
                    fillcolor = self.get_node_color([0x00, 0x00, 0x00], self.WBDebG.nodes[n]['weight'])
            # fillcolor format '#RRGGBB', for example: fillcolor = '#FF8080'
            n.attr['fillcolor'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, fillcolor)])
            n.attr['color'] = '#%s' % ''.join([c[2:].zfill(2) for c in map(hex, bordercolor)])
            n.attr['penwidth'] = penwidth
        #for e in gv.edges():
        #    e.attr['color'] = '#FF8080'
        gv.layout(prog = 'dot', args = '-Goverlap=false -Gnodesep=0.2 -Granksep=0.2 -Grankdir=BT -GK=800 -Gstart=17 -Gmaxiter=600')
        gv.draw(output_file_name, format = 'png')

    def ccia2018_stats_to_file(self, args):
        '''
        Write WBDebG statistics and information to file
        '''
        print('Writing statistics to file...')
        num_nodes = self.WBDebG.number_of_nodes()
        out_str = 'CCIA 2018 stats\n---------------\nInput file = %s\n' % args.input_file
        out_str += 'Number of nodes = %i\n' % num_nodes
        out_str += 'Number of edges = %i\n' % self.WBDebG.number_of_edges()
        out_str += 'PDebT maximum depth = %i\n\n' % self.PDebT.nodes[self.root_id]['max_depth']
        yes_fav = not_fav = 0  # Number of comments in favor and not in favor
        yes_fav_sol = not_fav_sol = 0  # Number of comments in favor and not in favor in the solution
        yes_fav_solw = not_fav_solw = 0  # Weight of comments in favor and not in favor in the solution
        for n, nd in self.WBDebG.nodes(data = True):
            node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
            if nd['bipartite_set'] == 1:
                yes_fav += 1
                if node_id in self.VAF_accepted:
                    yes_fav_sol += 1
                    yes_fav_solw += nd['weight']
            else:
                not_fav += 1
                if node_id in self.VAF_accepted:
                    not_fav_sol += 1
                    not_fav_solw += nd['weight']
        out_str += '#Nodes in favor = %i (%.2f%% of total)\n' % (yes_fav, 100.0 * yes_fav / num_nodes)
        out_str += '#Nodes not in favor = %i (%.2f%% of total)\n\n' % (not_fav, 100.0 * not_fav / num_nodes)
        out_str += '#Nodes in favor in solution = %i (%.2f%% of in favor)\n' % (yes_fav_sol, 100.0 * yes_fav_sol / yes_fav)
        out_str += '#Nodes not in favor in solution = %i (%.2f%% of not in favor)\n' % (not_fav_sol, 100.0 * not_fav_sol / not_fav)
        out_str += 'Percentage difference = %0.2f%%\n\n' % abs(100.0 * yes_fav_sol / yes_fav - 100.0 * not_fav_sol / not_fav)
        out_str += 'Weight of solution, in favor - not in favor = %i - %i = %i\n' % (yes_fav_solw, not_fav_solw, yes_fav_solw - not_fav_solw)
        out_str += 'Weight of solution normalized [-1, 1] = %0.2f\n' % (float(yes_fav_solw - not_fav_solw) / float(yes_fav_solw + not_fav_solw))
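        # Illustrative check of the normalization above (hypothetical weights):
        # yes_fav_solw = 30, not_fav_solw = 10 --> (30 - 10) / (30 + 10) = 0.50,
        # i.e. the accepted comments lean towards the side in favor of the root.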
        # More author based stats for CCIA 2019
        if args.ccia2019_order:
            authors = {}
            for n, nd in self.WBDebG.nodes(data = True):
                yes_fav_in_sol = 0
                not_fav_in_sol = 0
                a = self.WBDebG.nodes[n]['data'].get('author')
                if a not in authors:
                    authors[a] = {}
                    authors[a]['num_nodes'] = 0  # Number of nodes
                    authors[a]['num_nodes_yes_fav'] = 0  # Number of nodes in favor of root
                    authors[a]['num_nodes_not_fav'] = 0  # Number of nodes not in favor of root
                    authors[a]['num_nodes_in_sol'] = 0  # Number of nodes in solution
                    authors[a]['num_nodes_not_in_sol'] = 0  # Number of nodes not in solution
                    authors[a]['num_nodes_yes_fav_in_sol'] = 0  # Number of nodes in favor of root and in solution
                    authors[a]['num_nodes_not_fav_in_sol'] = 0  # Number of nodes not in favor of root and in solution
                    authors[a]['num_in_edges'] = 0  # Replies to this node
                    authors[a]['num_in_edges_pos'] = 0  # Replies to this node that support it
                    authors[a]['num_in_edges_neg'] = 0  # Replies to this node that do not support it
                    authors[a]['num_init_nodes'] = 0  # Number of nodes before alpha pruning
                    authors[a]['sum_sentiment_radicality'] = 0  # sum_{c \in Comments} abs(sentiment_rel(c))
                    authors[a]['scores'] = []  # List of scores of the author's comments
                authors[a]['num_nodes'] += 1
                authors[a]['scores'].append(int(self.WBDebG.nodes[n]['data'].get('score')))
                if self.WBDebG.nodes[n]['bipartite_set'] == 1:
                    authors[a]['num_nodes_yes_fav'] += 1
                    yes_fav_in_sol += 1
                else:
                    authors[a]['num_nodes_not_fav'] += 1
                    not_fav_in_sol += 1
                node_id = int(self.WBDebG.nodes[n]['data'].get('id'))
                if node_id in self.VAF_accepted:
                    authors[a]['num_nodes_in_sol'] += 1
                    yes_fav_in_sol += 1
                    not_fav_in_sol += 1
                else:
                    authors[a]['num_nodes_not_in_sol'] += 1
                if yes_fav_in_sol == 2:  # In favor of root and in solution
                    authors[a]['num_nodes_yes_fav_in_sol'] += 1
                elif not_fav_in_sol == 2:  # Not in favor of root and in solution
                    authors[a]['num_nodes_not_fav_in_sol'] += 1
                authors[a]['num_in_edges'] += self.PDebT.in_degree(n)
                for e in self.PDebT.in_edges(n):
                    if self.PDebT.nodes[e[0]]['bipartite_set'] == self.PDebT.nodes[e[1]]['bipartite_set']:
                        authors[a]['num_in_edges_pos'] += 1
                    else:
                        authors[a]['num_in_edges_neg'] += 1
            for n, nd in self.DebT.nodes(data = True):
                a = self.DebT.nodes[n]['data'].get('author')
                if a in authors:
                    authors[a]['num_init_nodes'] += 1  # Counts also the nodes removed by the alpha pruning
                    authors[a]['sum_sentiment_radicality'] += abs(self.DebT.nodes[n]['sentiment_not_normalized'])
            out_str += self.get_stats_ccia2019(authors, args)
        # Write to file
        output_file_name = '%s.wbg-sol.info' % args.input_file
        output_file = open(output_file_name, 'w')
        output_file.write(out_str)
        output_file.close()

    def get_stats_ccia2019(self, authors, args):
        '''
        Get statistics based on the CCIA 2019 paper
        '''
        for a in authors:
            # Radicality
            authors[a]['radicality'] = float(authors[a]['sum_sentiment_radicality']) / authors[a]['num_init_nodes']
            # Attention generator
            authors[a]['attention_generator_pos'] = float(authors[a]['num_in_edges_pos']) / authors[a]['num_nodes']
            authors[a]['attention_generator_neg'] = float(authors[a]['num_in_edges_neg']) / authors[a]['num_nodes']
            # Author polarization --> [-1, 1]
            if (authors[a]['num_nodes_yes_fav_in_sol'] + authors[a]['num_nodes_not_fav_in_sol']) > 0:
                authors[a]['pol_sol'] = float(authors[a]['num_nodes_yes_fav_in_sol'] - authors[a]['num_nodes_not_fav_in_sol']) / (authors[a]['num_nodes_yes_fav_in_sol'] + authors[a]['num_nodes_not_fav_in_sol'])
            else:
                authors[a]['pol_sol'] = None
            authors[a]['pol'] = float(authors[a]['num_nodes_yes_fav'] - authors[a]['num_nodes_not_fav']) / (authors[a]['num_nodes_yes_fav'] + authors[a]['num_nodes_not_fav'])
            # max(|score|) for all author comments
            authors[a]['max_abs_score'] = max(map(abs, authors[a]['scores']))
            # sum(|score|) for all author comments
            authors[a]['sum_abs_score'] = sum(map(abs, authors[a]['scores']))
            # sum(score) for all author comments
            authors[a]['sum_score'] = sum(authors[a]['scores'])
            # number of author comments
            authors[a]['num_comments'] = len(authors[a]['scores'])
        out_str = 'CCIA 2019 stats\n---------------\n'
        for ordering in args.ccia2019_order:
            # Sort by authors relevance choice (ccia2019_order parameter)
            sorted_authors = sorted(authors.items(), key = lambda a: a[1][ordering], reverse = True)
            out_str += 'Number of authors: %i\n' % len(authors)
            out_str += 'Sorted by: %s\n' % ordering
            # Output top X authors data
            data = ['author', 'max_abs_score', 'sum_abs_score', 'sum_score', 'num_comments', 'radicality', 'att_gen_pos', 'att_gen_neg', 'polarization']
            out_str += format_data(data)
            for a in sorted_authors[:20]:
                data = [a[0], a[1]['max_abs_score'], a[1]['sum_abs_score'], a[1]['sum_score'], a[1]['num_comments'], a[1]['radicality'], a[1]['attention_generator_pos'], a[1]['attention_generator_neg'], a[1]['pol']]
                out_str += format_data(data)
        return out_str
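    # Illustrative check of the author polarization formula above (hypothetical
    # counts): num_nodes_yes_fav = 6, num_nodes_not_fav = 2
    #   pol = (6 - 2) / (6 + 2) = 0.5  --> the author leans towards the root's side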
    def prca2019_authors_relevance(self, args):
        '''
        Compute relevance(u) = sum_{c in Gamma | user(c) = u} W(score(c))
        '''
        print('  Computing authors relevance...')
        authors = {}
        for n, nd in self.PDebT.nodes(data = True):
            a = self.PDebT.nodes[n]['data'].get('author')
            if a not in authors:
                authors[a] = {}
                authors[a]['wscores'] = []  # List of scores of the author's comments
            authors[a]['wscores'].append(scale_weight(int(self.PDebT.nodes[n]['data'].get('score')), args))
        for a in authors:
            authors[a]['sum_wscore'] = sum(authors[a]['wscores'])
        return sorted(authors.items(), key = lambda a: a[1]['sum_wscore'], reverse = True)
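    # Illustrative check of prca2019_authors_relevance() above (hypothetical
    # scores, log_base = 10): an author with comment scores [15, 3] gets
    #   W(15) = floor(log10(15)) + 1 = 2,  W(3) = floor(log10(3)) + 1 = 1
    #   relevance = 2 + 1 = 3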
    def prca2019_remove_author(self, author, G):
        '''
        Return a copy of WBDebG with author removed (RDebT, Restricted DebT)
        '''
        # Copy graph
        res_G = networkx.DiGraph()
        res_G.add_nodes_from(G.nodes(data = True))
        for e1, e2, ed in G.edges(data = True):
            res_G.add_edge(e1, e2, data = G[e1][e2]['data'])
        # Remove author (iterate over a snapshot, since nodes are removed inside the loop)
        rg = res_G.reverse()  # Reverse graph (direction of edges reversed)
        for n, nd in list(res_G.nodes(data = True)):
            if nd['data'].get('author') == author:
                if n in res_G and nd['data'].get('comment_id') != self.root_id:  # Not already removed (appears in a previous subtree) and not root node
                    # Get subtree of node n and remove it
                    st = networkx.algorithms.traversal.depth_first_search.dfs_tree(rg, n)
                    res_G.remove_nodes_from(st.nodes())
        return res_G

    def prca2019_get_stats(self, G):
        '''
        Get the stats needed of the authors' list from G
        '''
        G_stats = {}
        G_stats['num_comments'] = G.number_of_nodes()  # Number of comments
        G_stats['list_comments_id'] = list(G.nodes())  # List of comments id (Reddit id)
        G_stats['list_nodes_id'] = [int(self.comment_id[x]) for x in G_stats['list_comments_id']]  # List of nodes id [0, ...] (0 = root node)
        G_stats['list_nodes_id_Cplus'] = [int(self.comment_id[n]) for n, nd in G.nodes(data = True) if nd['bipartite_set'] == 1]  # List of nodes id in favor of root node
        G_stats['list_nodes_id_Cminus'] = [int(self.comment_id[n]) for n, nd in G.nodes(data = True) if nd['bipartite_set'] == -1]  # List of nodes id NOT in favor of root node
        return G_stats

    def prca2019_analysis(self, args):
        '''
        Perform PRL VSI PR&CA 2019 analysis
        '''
        print('Performing VSI PR&CA 2019 analysis...')
        tmp_file_name = '/tmp/tmp-reddit-at-WBDebG.tmp.xml'
        sorted_authors = self.prca2019_authors_relevance(args)  # Most relevant authors
        PDebT_stats = self.prca2019_get_stats(self.PDebT)
        res_PDebT_stats = {}
        res_VAF_accepted = {}
        ai = 0
        for a, ad in sorted_authors[:args.prca2019]:
            print('  Analysing author "%s" (%i/%i)...' % (a, ai + 1, min(args.prca2019, len(sorted_authors))))
            res_PDebT = self.prca2019_remove_author(a, self.PDebT)
            res_PDebT_stats[a] = self.prca2019_get_stats(res_PDebT)
            res_WBDebG = self.PDebG_to_WBDebG(res_PDebT)
            self.WBDebG_to_xml(args, res_WBDebG, tmp_file_name)
            res_VAF_accepted[a] = VAF_solver(args, tmp_file_name)
            ai = ai + 1
        self.prca2019_stats_to_file(sorted_authors, PDebT_stats, res_PDebT_stats, res_VAF_accepted, args)

    def prca2019_stats_to_file(self, sorted_authors, PDebT_stats, res_PDebT_stats, res_VAF_accepted, args):
        '''
        Compute PRL VSI PR&CA 2019 stats and output them to file
        '''
        output_file_name = '%s.prca2019.info' % args.input_file
        print('Writing statistics to file "%s"...' % output_file_name)
        out_str = 'Input file: %s\n' % args.input_file
        out_str += 'Number of authors: %i\n' % len(sorted_authors)
        out_str += 'Number of comments: %i\n' % PDebT_stats['num_comments']
        data = ['author', '#comments', 'relevance', 'engaging', 'influence', 'rebalancing', 'rebalancing2']
        out_str += format_data(data)
        # Data from initial graphs and solution with all users
        Ca = frozenset(PDebT_stats['list_nodes_id'])
        Cplus = frozenset(PDebT_stats['list_nodes_id_Cplus'])
        Cminus = frozenset(PDebT_stats['list_nodes_id_Cminus'])
        S = frozenset(self.VAF_accepted)
        polS = (len(S & Cplus) - len(S & Cminus)) / float(len(S))
        lengaging = []
        linfluence = []
        lrebalancing = []
        lrebalancing2 = []
        for a, ad in sorted_authors[:args.prca2019]:
            # Data of the restricted (without user) graphs and solution
            Cau = frozenset(res_PDebT_stats[a]['list_nodes_id'])
            Cplusu = frozenset(res_PDebT_stats[a]['list_nodes_id_Cplus'])
            Cminusu = frozenset(res_PDebT_stats[a]['list_nodes_id_Cminus'])
            Su = frozenset(res_VAF_accepted[a])
            polSu = (len(Su & Cplusu) - len(Su & Cminusu)) / float(len(Su))
            polSCau = (len(S & Cplus & Cau) - len(S & Cminus & Cau)) / float(len(S & Cau))
            # engaging(u) = #(Ca \ Cau) / #Ca, fraction of the conversation that disappears when the user is removed
            engaging = len(Ca - Cau) / float(len(Ca))
            lengaging.append(engaging)
            # influence(u) = (#((Cau \ Su) \cap S) + #(Su \cap (Ca \ S))) / #Cau, (comments moved into S + comments moved out of S) / #remaining comments
            influence = (len((Cau - Su) & S) + len(Su & (Ca - S))) / float(len(Cau))
            linfluence.append(influence)
            # rebalancing(u) = |polarization(S) - polarization(Su)|, polarization(S) = (#(S \cap C+) - #(S \cap C-)) / #S, absolute change in polarization after removing user
            rebalancing = abs(polS - polSu)
            lrebalancing.append(rebalancing)
            # rebalancing2(u) = |polarization(S \cap Cau) - polarization(Su)|, absolute change in polarization after removing user
            rebalancing2 = abs(polSCau - polSu)
            lrebalancing2.append(rebalancing2)
            # Add row to output string
            data = [a, len(ad['wscores']), ad['sum_wscore'], engaging, influence, rebalancing, rebalancing2]
            out_str += format_data(data)
        data = ['Mean', '', '', sum(lengaging) / len(lengaging), sum(linfluence) / len(linfluence), sum(lrebalancing) / len(lrebalancing), sum(lrebalancing2) / len(lrebalancing2)]
        out_str += format_data(data)
        # Write to file
        output_file = open(output_file_name, 'w')
        output_file.write(out_str)
        output_file.close()

# Functions
def args2str(args):
    argstr = '=== Begin arguments ===\n'
    argstr += 'Input file: %s\n' % args.input_file
    argstr += 'Alpha parameter: %f\n' % args.alpha
    argstr += 'Algorithm: %s\n' % args.algorithm
    argstr += 'Log scale base: %i\n' % args.log_base
    argstr += 'Socialarg git path: %s\n' % args.socialarg_path
    argstr += 'Spark path: %s\n' % args.spark_path
    argstr += 'Draw graphs: %s\n' % args.draw_graphs
    argstr += 'Neutral comments: %s\n' % args.neutral_comments
    argstr += 'CCIA2019 author ordering: %s\n' % args.ccia2019_order
    argstr += 'VSI PR&CA 2019 analysis: %i\n' % args.prca2019
    argstr += 'User-based analysis: %s\n' % args.user
    argstr += 'User valuation: %s\n' % args.user_valuation
    argstr += 'SCIP output: %s\n' % args.scip_output
    argstr += 'Parameters: %s\n' % args.params
    argstr += 'Random seed: %s\n' % args.seed
    argstr += '=== End arguments ==='
    return argstr

def scale_weight(weight, args):
    '''
    Scales the weight using a log function
    '''
    if weight >= 1:
        return int(math.floor(math.log(weight, args.log_base)) + 1)
    else:
        return 0
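# Illustrative check of scale_weight() above (hypothetical scores, args.log_base = 10):
#   scale_weight(1, args)   --> floor(log10(1)) + 1   = 1
#   scale_weight(100, args) --> floor(log10(100)) + 1 = 3
#   scale_weight(-5, args)  --> 0  (scores below 1 are mapped to 0)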
def VAF_solver(args, input_file_name = None):
    '''
    Solves the discussion using the VAF solver and returns the accepted nodes
    '''
    if input_file_name:
        print('  Solving graph with VAF solver...')
        input_file_name = input_file_name[:-4]  # Strip the '.xml' extension
    else:
        print('Solving graph with VAF solver...')
        input_file_name = '%s.wbg' % args.input_file
    output_file_name = '/tmp/tmp-reddit-at.out'
    # Check files
    if os.path.isdir(os.path.expanduser(args.spark_path)):
        args.spark_path = os.path.abspath(os.path.expanduser(args.spark_path))
        spark_submit = '%s/bin/spark-submit' % args.spark_path
        if not os.path.exists(spark_submit):
            sys.exit('ERROR: spark-submit not found at "%s".' % spark_submit)
    else:
        sys.exit('ERROR: Spark folder not found "%s".' % args.spark_path)
    if os.path.isdir(os.path.expanduser(args.socialarg_path)):
        args.socialarg_path = os.path.abspath(os.path.expanduser(args.socialarg_path))
        #analyzer_jar = '%s/distributed_social_network_analyzer/target/scala-2.11/social-network-analyzer_2.11-1.0.jar' % args.socialarg_path
        analyzer_jar = '%s/distributed_social_network_analyzer/target/scala-2.12/social-network-analyzer_2.12-1.0.jar' % args.socialarg_path
        if not os.path.exists(analyzer_jar):
            sys.exit('ERROR: analyzer jar file not found at "%s".' % analyzer_jar)
    else:
        sys.exit('ERROR: socialarg git repo folder not found "%s".' % args.socialarg_path)
    # Run solver
    cmd = '(time %s --master local[4] --class "MainAppFromXML" %s %s) &> %s' % (spark_submit, analyzer_jar, input_file_name, output_file_name)
    os.system(cmd)
    # Parse output
    accepted = []
    try:
        xml = ET.parse('%s-xml.sol' % input_file_name)
        answer = xml.find('answer')
        for a in answer.iter('arg'):
            accepted.append(int(a.attrib['id']))
    except Exception:
        sys.exit('ERROR: something happened while parsing solver output "%s-xml.sol".' % input_file_name)
    return accepted

def format_data(data):
    '''
    Format data list for an output of fixed column width
    '''
    width = [20, 9, 9, 9, 9, 11, 12] + [12] * len(data)
    sep = '|'
    eol = '\n'
    out_str = ''
    while data:
        w = width.pop(0)
        d = data.pop(0)
        if isinstance(d, float) and d != 0:
            data_str = '{:0.10f}'.format(d)
        else:
            data_str = str(d)
        out_str += data_str[:w].ljust(w)
        if data:
            out_str += sep
    out_str += eol
    return out_str
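# Illustrative check of format_data() above (hypothetical row): each field is
# truncated/padded to its fixed column width and fields are separated by '|':
#   format_data(['author1', 42, 0.5])
#   --> 'author1' padded to 20 chars + '|' + '42' padded to 9 + '|'
#       + '0.5000000' (the float rendered with 10 decimals, truncated to 9 chars) + '\n'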
# Main
if __name__ == '__main__':
    # Parse arguments
    parser = argparse.ArgumentParser(description = 'Reddit Analysis Tool.')
    # Optional arguments
    parser.add_argument('-a', '--alpha', default = 0.5, type = float, help = 'Alpha parameter used as threshold for several functions (default: 0.5)', dest = 'alpha')
    parser.add_argument('-al', '--algorithm', type = str, default = 'g0', help = 'Algorithm and parameters in case available, see docstring for more information (default: g0)', dest = 'algorithm')
    parser.add_argument('--ccia2019_order', nargs = '+', type = str, choices = ['max_abs_score', 'sum_abs_score', 'sum_score', 'num_comments'], help = 'Author ordering for CCIA 2019 stats (default: max_abs_score)', dest = 'ccia2019_order')
    parser.add_argument('-d', '--draw_graphs', action = 'store_true', default = False, help = 'Draws the graphs of all the steps of the analysis (default: False)', dest = 'draw_graphs')
    parser.add_argument('-if', '--input_file', default = None, type = str, help = 'Input file name of the xml with the Reddit post information', dest = 'input_file')
    parser.add_argument('-lb', '--log_base', default = 10, type = int, help = 'Logarithmic scale base for weighting (default: 10)', dest = 'log_base')
    parser.add_argument('-nc', '--neutral_comments', nargs = '?', type = str, default = 'do_nothing', choices = ['do_nothing', 'remove_subtree', 'to_positive'], help = 'Neutral comments treatment (default: do_nothing)', dest = 'neutral_comments')
    parser.add_argument('-p', '--params', default = None, type = str, help = 'Argument used to specify parameters for some functionalities', dest = 'params')
    parser.add_argument('--prca2019', default = 0, type = int, help = 'PRL VSI in PR&CA 2019 analysis (default: 0)', dest = 'prca2019')
    parser.add_argument('-s', '--seed', default = None, type = str, help = 'Seed to initialize random numbers (default: None)', dest = 'seed')
    parser.add_argument('-so', '--scip_output', action = 'store_true', default = False, help = 'Outputs UDebG files in SCIP format to solve bipartition problem (default: False)', dest = 'scip_output')
    parser.add_argument('--socialarg_path', default = '~/git/socialarg', type = str, help = 'Path to the socialarg git repo (default: ~/git/socialarg)', dest = 'socialarg_path')
    parser.add_argument('-sp', '--spark_path', default = '~/.local/spark-2.2.1-bin-hadoop2.7', type = str, help = 'Spark path (default: ~/.local/spark-2.2.1-bin-hadoop2.7)', dest = 'spark_path')
    parser.add_argument('-u', '--user', default = None, type = str, choices = ['mdai2020', 'wia2021'], help = 'User-based analysis (default: None)', dest = 'user')
    parser.add_argument('-uv', '--user_valuation', default = None, type = str, choices = ['comment_karma', 'sum_scores'], help = 'User valuation for a VAF over UDebG (default: None)', dest = 'user_valuation')
    args = parser.parse_args()
    print(args2str(args))
    # No input file defined
    if not args.input_file:
        # Generate random UDebG
        udebg.UDebG(None, None, args)
        exit()
    # Read debate and create initial Graph
    rg = RedditG(args.input_file)
    # User-oriented analysis
    if args.user:
        if args.user == 'wia2021':
            rg.wia2021_DebT(args)
            rg.wia2021_SDebT(args)
        udebg.UDebG(rg.SDebT, rg.root_id, args)
        exit()
    # Perform analysis (WBDebG)
    rg.ccia18_analysis(args)
    # Output results
    rg.WBDebG2xml(args)
    if args.draw_graphs:
        rg.draw_ccia18_PCT(args)
        rg.draw_ccia18_WBG(args)
    # Compute solution using VAF solver
    rg.VAF_accepted = VAF_solver(args)
    # Output results with solution
    if args.draw_graphs:
        rg.draw_ccia18_WBG(args)
    # Compute stats
    if args.prca2019:
        rg.prca2019_analysis(args)
    else:
        rg.ccia2018_stats_to_file(args)
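# Example invocation (hypothetical file names; the script is assumed to be saved
# as reddit_analysis.py):
#   python3 reddit_analysis.py -if post.xml -d --prca2019 10
# reads the debate from post.xml, draws the intermediate graphs, and runs the
# PR&CA 2019 analysis for the 10 most relevant authors.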