@inproceedings{wang-2018-interpreting,
  author    = {Wang, Cindy},
  title     = {Interpreting Neural Network Hate Speech Classifiers},
  booktitle = {Proceedings of the 2nd Workshop on Abusive Language Online ({ALW}2)},
  pages     = {86--92},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  doi       = {10.18653/v1/W18-5111},
  url       = {https://aclanthology.org/W18-5111},
  abstract  = {Deep neural networks have been applied to hate speech detection with apparent success, but they have limited practical applicability without transparency into the predictions they make. In this paper, we perform several experiments to visualize and understand a state-of-the-art neural network classifier for hate speech (Zhang et al., 2018). We adapt techniques from computer vision to visualize sensitive regions of the input stimuli and identify the features learned by individual neurons. We also introduce a method to discover the keywords that are most predictive of hate speech. Our analyses explain the aspects of neural networks that work well and point out areas for further improvement.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="wang-2018-interpreting">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>Interpreting Neural Network Hate Speech Classifiers</title>
    </titleInfo>
    <!-- Sole author, split into given and family name parts -->
    <name type="personal">
      <namePart type="given">Cindy</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-oct</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on Abusive Language Online (ALW2)</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Brussels, Belgium</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Deep neural networks have been applied to hate speech detection with apparent success, but they have limited practical applicability without transparency into the predictions they make. In this paper, we perform several experiments to visualize and understand a state-of-the-art neural network classifier for hate speech (Zhang et al., 2018). We adapt techniques from computer vision to visualize sensitive regions of the input stimuli and identify the features learned by individual neurons. We also introduce a method to discover the keywords that are most predictive of hate speech. Our analyses explain the aspects of neural networks that work well and point out areas for further improvement.</abstract>
    <!-- Identifiers and canonical landing page -->
    <identifier type="citekey">wang-2018-interpreting</identifier>
    <identifier type="doi">10.18653/v1/W18-5111</identifier>
    <location>
      <url>https://aclanthology.org/W18-5111</url>
    </location>
    <!-- Page range of the paper; date repeats the issue date above -->
    <part>
      <date>2018-oct</date>
      <extent unit="page">
        <start>86</start>
        <end>92</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Interpreting Neural Network Hate Speech Classifiers
%A Wang, Cindy
%S Proceedings of the 2nd Workshop on Abusive Language Online (ALW2)
%D 2018
%8 oct
%I Association for Computational Linguistics
%C Brussels, Belgium
%F wang-2018-interpreting
%X Deep neural networks have been applied to hate speech detection with apparent success, but they have limited practical applicability without transparency into the predictions they make. In this paper, we perform several experiments to visualize and understand a state-of-the-art neural network classifier for hate speech (Zhang et al., 2018). We adapt techniques from computer vision to visualize sensitive regions of the input stimuli and identify the features learned by individual neurons. We also introduce a method to discover the keywords that are most predictive of hate speech. Our analyses explain the aspects of neural networks that work well and point out areas for further improvement.
%R 10.18653/v1/W18-5111
%U https://aclanthology.org/W18-5111
%U https://doi.org/10.18653/v1/W18-5111
%P 86-92
Markdown (Informal)
[Interpreting Neural Network Hate Speech Classifiers](https://aclanthology.org/W18-5111) (Wang, 2018)
ACL