Upload
phungdang
View
216
Download
0
Embed Size (px)
Citation preview
CS544: Named Entity Discrimination
Zornitsa Kozareva
USC/ISI
Marina del Rey, [email protected]
www.isi.edu/~kozareva
January 31, 2013
!"#$%&'()*+)"'
>?6&23&7*550&@6AA3B'!"##$&CD&%&''(,'-..$"//0'1!234!4'5676'-.8+$#9:;'<#;',,,'!"##$'=,'%&''()*&''(>+/+',".?'
-@<,%A8'7*550&@6AA3BC?*+DE"'-8"$+%#F/'@A/:'<#):".'+/'#'9A)*B$?))+)*'-8"$+%#)'GH'
/&AI'J$A.?%".'K;'LM:&'2"):?$;'CANO'#).'+/':&"'9A)*"/:B$?))+)*'J$A*$#8'AP'#);'Q+).'+)'
:&"'&+/:A$;'AP':&"'DDD'-99"*".'R+99"$'S#.'S")+".'TA).O'!"$E+%"/'!":' PA$'GIA'U+V9"'W+$9.,'-' ' X?.*"'&#.'.")+".'
KA).' PA$' !"##$) %&''(,' DDD' !"##$) %&''(O' I&A' +/' #%%?/".' AP' Q+99+)*' &+/' YB;"#$BA9.+)' :&+/'?).#:".'DDD'
7*550&@6AA3B'<+Q+J".+#O' :&"'P$""'")%;%9AJ".+#&S$,'!"##$'=,'%&''(' ZKA$)'L['\#)?#$;']Y5L^'+/'#'J$A8+)"):'$"/"#$%&"$'+)':&"'_"9./'AP'9''<A$9.'@#$D#9'-$:/'W#8"/'=#)Q0'`$.'S#)O'S+/%+J9+)"0'\?B\?:/?'4//?".'T;0'!"##$)%&''('
]M:&'S#)'=#)Q0']/:'
&&&E2/?&'()*+&,-./0&C*46F-2.6-& GCH&I,C&
a'
''''a'
a'
L'
J+*(KK0;&E*&E(-/&
!"##$)%&''(O'#'_b&'*")"$#DA)'P#$8"$O'
<#;)"'2$;:/'_)+/&".'&#$E"/D)*'&+/'%$AJ'
!"##$)%&''('+/':&"'$#*"B_99".O'.A8"/D%B
#K?/+)*'%#$""$'%$+8+)#9'I&A'Q+99".'&+/''
!"##$)%&''(O'I&A'+/'#%%?/".'AP'Q+99+)*'&+/'cB;"#$BA9.'.#?*&:"$'#).'&"$'K"/:'
S$,'!"##$'=,'%&''('+/'#'J$A8+)"):'$"/"#$%&"$'
+)':&"'_"9./'AP'%A8J?:#DA)#9'9+)*?+/D%'
!"##$'=,'%&''(,'-..$"//0'1!234!4'5676'-.8+$#9:;'<#;'D'
`'
d('="%A*)+DA)'E/,'d('S+/%$+8+)#DA)'
• d(' ="%A*)+DA)' e' .":"%DA)' f' %9#//+_%#DA)' AP' ")D:;'8")DA)"/'+):A'#'J$"."_)".'/":'AP'%#:"*A$+"/,'
! ''#%&+"E"/'A)9;'#'J#$D#9'.+/#8K+*?#DA)'AP')#8"/'
• d('S+/%$+8+)#DA)'e'_).+)*':&"'#%:?#9'")D:;'.")A:".'
K;'#'J#$D%?9#')#8"'A%%?$$")%"'+)':"N:,'
I,C&
a'
''''a'
a'
<&;'+/'+:'%#99".'gS+/%$+8+)#DA)ha'
123452)2-(.6-&
B &/?*&/6/(K&-8)A*5&6L&3*-3*3&23&8-M-6E-&
B &/?*&)*(-2-F&6L&*(4?&3*-3*&23&8-M-6E-&
B &-6&3N*42O4&)(NN2-F&6L&4K83/*5P&3*-3*&
123()A2F8(.6-&
B &/?*&/6/(K&-8)A*5&6L&3*-3*3&23&M-6E-&
B &/?*&)*(-2-F&6L&*(4?&3*-3*&23&M-6E-&
B &/?*&65+*5&23&A(3*+&6-&/?*&L5*Q8*-40&
&8"#)+)*']0'
''''''''''':&"'/9AJ"'K"/+."'#'KA.;'AP'I#:"$'
''''''''''''8"#)+)*'L0''
''''''''''''."JA/+:A$;'_)#)%+#9'+)/D:?DA)'
R&
A(-M& 7*550&@6AA3&
*$A?J]'
*$A?J'L'
*$A?J'`'
5'
48JA$:#)%"'AP'd#8"'-8K+*?+:;'A)':&"'<"K'
• i?"$+"/' #KA?:' d(/' %A)/D:?:"' /+*)+_%#):' JA$DA)' AP'<"K'j?"$+"/0'
– ]]B]7k'%A):#+)'J"$/A)')#8"l'
– 5k'#$"'#KA?:'#'J"$/A)')#8"*
• 4."#99;O' /"#$%&' $"/?9:/' /&A?9.' K"' %9?/:"$".' /?%&' :&#:'"#%&'%9?/:"$'%A$$"/JA)./':A':&"'/#8"'+).+E+.?#9'
– P#/:"$'P#%:'"N:$#%DA)'
– 8A$"'#%%?$#:"'+)PA$8#DA)'$":$+"E#9'
['l'/:?.;'K;'\#E+"$'-$D9"/O'LMMY'
m)':&"'<"K'n'
• dAKA.;'Q)AI/'&AI'8#);'/")/"/'Z8"#)+)*/^'#$"'
:&"$"'PA$'#'*+E")'J"$/A)')#8"'
• 4:'+/'+8JA//+K9"':A'"/D8#:"'#).':$#%"':&"'8A/:'
P$"j?"):'/")/"'
– :&"':#/Q'+/'D8"'%A)/?8+)*'#).':".+A?/'PA$'&?8#)/'
– )"I'<"K'J#*"/'%A)/:#):9;'#JJ"#$'
– A9.'<"K'J#*"/'8+*&:'K"'."9":".'AE"$'D8"'
6'
d#8"'-8K+*?+:;'+)'<+Q+J".+#'
He is seen as a national hero by those who live in Georgia.
c'
?SN%TT3*(54?D2-/*K283D46)T&
1,!,'2")/?/'T?$"#?'/:#:"/'YMOMMM')#8"/'#$"'/&#$".'K;']MMOMMMOMMM'J"AJ9"'
d#8"'-8K+*?+:;'+)'S#:#'T#/"/''
Y'
?SN%TT3*(54?D4(556/<D65FT3/(AK*T3*(54?&
!(556/&"*(54?&C*38K/3&!K83/*52-F&,-F2-*&
GA.#;'
]M'
?SN%TT3*(54?D4(556/U3*(54?D46)T4(556/<UE*A(NNT3*(54?&
!(556/&"*(54?&
GA.#;'
oAI'%#)'I"'/A9E"':&+/'J$AK9"8a'
G;J"/'AP'@#%&+)"'U"#$)+)*'
• 1)/?J"$E+/".'U"#$)+)*'– %A$$"%:'$"/JA)/"/'Z:#$*":/^'#$"')A:'J$AE+.".'– :&"' #9*A$+:&8' +.")D_"/' /+8+9#$+D"/' K":I"")' :&"'
+)J?:/'K#/".'A)'/A8":&+)*'+)'%A88A)'
• @":&A.0''
– 29?/:"$+)*'
• dUp'G#/Q/0''– d#8".'()D:;'S+/#8K+*?#DA)O'G"N:'2#:"*A$+q#DA)'
]L'
29?/:"$+)*'
• -$"':&"$"'#);'g*$A?J/h'+)':&"'.#:#'a''• <&#:'+/'"#%&'*$A?J'a''
• oAI'8#);'*$A?J/'#$"':&"$"'a''
• oAI'.+.';A?'+.")DP;':&"8a'
]`'
29?/:"$+)*'
]5'
]5'
29?/:"$+)*'K;'
%A9A$' /+q"'
<&#:'+/'29?/:"$+)*a'
• Clustering is the process of grouping a set of objects into classes of similar objects, without the help of training examples
– classification vs. clustering
]['
-JJ9+%#DA)/'
• 29?/:"$+)*'+/'#'%A88A)'#).'+8JA$:#):':#/Q':&#:'
_)./'8#);'#JJ9+%#DA)/'+)'!%+")%"O'()*+)""$+)*'
#8A)*'A:&"$/'
– *$A?J'*")"/':&#:'J"$PA$8':&"'/#8"'P?)%DA)'
– *$A?J'+).+E+.?#9/':&#:'&#E"'/+8+9#$'JA9+D%#9'E+"I'
– +.")DP;'/+8+9#$'AKX"%:/'P$A8'J+%:?$"/'
– %#:"*A$+q"'.A%?8"):/'AP'/+8+9#$':AJ+%/''
– .+/#8K+*?#:"')#8".'")DD"/'ZA?$'"N#8J9"^'
]6'
29?/:"$+)*'p$A%"//'
• S"_)"'#'P"#:?$"'E"%:A$':A'$"J$"/"):':&"'.#:#'– Ab")'%#99".'#'E"%:A$B/J#%"'8A."9'
• !"9"%:'C"#:?$"/&
]7'
Set of Objects
Feature Representation
Similarity Measure
Element Grouping
Output Interpretation
C"#:?$"'="J$"/"):#DA)'
]c'
• Bag-of-words: each term in a document is a feature of that document
C"#:?$"'="J$"/"):#DA)'
]Y'
• TF: term frequency
• definition: TF = $t_{ij}$ – frequency of term i in document j
• purpose: makes the frequent words for the document more important
(N#8J9"'
LM'
2&#J:"$]''''''''''''''''''''''''''r:"$8/'
> ' ' ' '56'
>> ' ' ' ']6'
#'' ' ' ' ']MM'
##$.E#$Q ' ' ']'
#K+9+:; ' ' ' '`'
#K9" ' ' ' 'L`'
n'
%A"s%+"): ' ' ']'
%9?/:"$/ ' ' ' ']M'
I+:& ' ' ' ']Y'
IA$./ ' ' ' '`5'
:&" ' ' ' 'LMM'
q"$A/ ' ' ' ']'
(N#8J9"'
L]'
2&#J:"$]''''''''''''''''''''''''''r:"$8/'
> ' ' ' '56'
>> ' ' ' ']6'
#'' ' ' ' ']MM'
((5+V(5M & & &:&
(A2K2/0 & & & &9&
#K9" ' ' ' 'L`'
n'
46*W42*-/ & & &:&
%9?/:"$/ ' ' ' ']M'
I+:& ' ' ' ']Y'
IA$./ ' ' ' '`5'
:&" ' ' ' 'LMM'
X*563 & & & &:&
Many low frequency words!
Can we adjust tf?!
C"#:?$"'="J$"/"):#DA)'
LL'
• IDF: inverse document frequency
• definition: IDF = $\log(N / n_i)$
  – $n_i$: number of documents containing term i
  – $N$: total number of documents
• purpose: makes rare words across documents more important
• TF: term frequency
• definition: TF = $t_{ij}$ – frequency of term i in document j
• purpose: makes the frequent words for the document more important
• IDF: inverse document frequency
• definition: IDF = $\log(N / n_i)$
  – $n_i$: number of documents containing term i
  – $N$: total number of documents
• purpose: makes rare words across documents more important
• TF.IDF (for term i in document j)
• definition: $t_{ij} \times \log(N / n_i)$
TF.IDF Term Weighting
m:&"$'C"#:?$"/'
L5'
• S"J").")%;'G$""'
• '-,.'&#/':IA'P"#:?$"/'+)%$"8"):".'K;'t]'– subj: John – obj: solution
• !&*,'&#/'A)"'P"#:?$"'– subj-of:find
found
John solution a
to problem
the
subj obj
mod det
pcomp
det
29?/:"$+)*'p$A%"//'
• i?#)D_"/':&"'%9A/")"//'K":I"")':&"'P"#:?$"'E"%:A$/'AP':IA'"9"8"):/&
L['
Set of Objects
Feature Representation
Similarity Measure
Element Grouping
Output Interpretation
<&#:'+/'!+8+9#$+:;a'
L6'
• Hard to define, but we know it when we see it.
• Easier to think in terms of the distance between vectors.
Properties of distance measure
• $D(A,B) = D(B,A)$    Symmetry
• $D(A,A) = 0$    Constancy of Self-Similarity
• $D(A,B) = 0$ iff $A = B$    Positivity (Separation)
• $D(A,B) \le D(A,C) + D(B,C)$    Triangular Inequality
L7'
p$AJ"$D"/'AP'.+/:#)%"'8"#/?$"'
• SZ-OT^'e'SZTO-^' ' '/$00"1#$)' ' 'm:&"$I+/"';A?'%A?9.'%9#+8':&#:'g-9"N'9AAQ/'9+Q"'TAKO'K?:'
TAK'9AAQ/')A:&+)*'9+Q"'-9"Nh'
• SZ-O-^'e'M' ' ' '''''2&,(13,4$)&5)/"657/80863#81$)
• SZ-OT^'e'M'+P'-e'T' ' ' '9&(8:;81$)/"<3#3:&,)
• SZ-OT^'u'SZ-O2^'t'SZTO2^''=#83,>?63#)@,"A?3681$)
Lc'
p$AJ"$D"/'AP'.+/:#)%"'8"#/?$"'
• SZ-OT^'e'SZTO-^' ' '/$00"1#$)' ' 'm:&"$I+/"';A?'%A?9.'%9#+8':&#:'g-9"N'9AAQ/'9+Q"'TAKO'K?:'
TAK'9AAQ/')A:&+)*'9+Q"'-9"Nh'
• SZ-O-^'e'M' ' ' '''''2&,(13,4$)&5)/"657/80863#81$)''''m:&"$I+/"';A?'%A?9.'%9#+8':&#:'g-9"N'9AAQ/'8A$"'9+Q"'TAKO'
:&#)'TAK'.A"/h'
• 'SZ-OT^'e'M'+P'-e'T ' ' '9&(8:;81$)/"<3#3:&,)
• SZ-OT^'u'SZ-O2^'t'SZTO2^''=#83,>?63#)@,"A?3681$)
LY'
p$AJ"$D"/'AP'.+/:#)%"'8"#/?$"'
• SZ-OT^'e'SZTO-^' ' '/$00"1#$)' ' 'm:&"$I+/"';A?'%A?9.'%9#+8':&#:'g-9"N'9AAQ/'9+Q"'TAKO'K?:'
TAK'9AAQ/')A:&+)*'9+Q"'-9"Nh'
• SZ-O-^'e'M' ' ' '''''2&,(13,4$)&5)/"657/80863#81$)''''m:&"$I+/"';A?'%A?9.'%9#+8':&#:'g-9"N'9AAQ/'8A$"'9+Q"'TAKO'
:&#)'TAK'.A"/h'
• 'SZ-OT^'e'M'+P'-e'T ' ' '9&(8:;81$)/"<3#3:&,)' ' m:&"$I+/"' :&"$"' #$"' AKX"%:/' +)' ;A?$' IA$9.' :&#:' #$"'
.+v"$"):O'K?:';A?'%#))A:':"99'#J#$:'
• SZ-OT^'u'SZ-O2^'t'SZTO2^''=#83,>?63#)@,"A?3681$)`M'
Properties of distance measure
• $D(A,B) = D(B,A)$    Symmetry
  Otherwise you could claim that “Alex looks like Bob, but Bob looks nothing like Alex”.
• $D(A,A) = 0$    Constancy of Self-Similarity
  Otherwise you could claim that “Alex looks more like Bob than Bob does”.
• $D(A,B) = 0$ iff $A = B$    Positivity (Separation)
  Otherwise there are objects in your world that are different, but you cannot tell them apart.
• $D(A,B) \le D(A,C) + D(B,C)$    Triangular Inequality
  Otherwise you could claim that “Alex is very like Bob and Alex is very like Carl, but Bob is very unlike Carl”.
31
S+/:#)%"'@"#/?$"/'
• Given two objects x and y, both with n values, calculate the Minkowski distance as
32
$$d(x, y) = \left( \sum_{i=1}^{n} |x_i - y_i|^p \right)^{1/p}$$
where
$$x = (x_1, x_2, \ldots, x_n)$$
$$y = (y_1, y_2, \ldots, y_n)$$
Euclidean distance (p = 2):
$$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$$
Manhattan distance (p = 1):
$$d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$$
(N#8J9"'
``'
Euclidean distance: $4^2 + 3^2 = 5^2$, so $d = 5$
(right triangle with legs 3 and 4)
Manhattan distance: $4 + 3 = 7$
Reminder: buying milk, from home to store
(.+:'S+/:#)%"'• GA' 8"#/?$"' :&"' /+8+9#$+:;' K":I"")' :IA' AKX"%:/O'
:$#)/PA$8'A)"'AP':&"'AKX"%:/'+):A':&"'A:&"$O'#).'8"#/?$"'
&AI'8?%&'"vA$:'+:':AAQ,'G&"'8"#/?$"'AP'"vA$:'K"%A8"/'
:&"'.+/:#)%"'8"#/?$",'
' 'G&"'.+/:#)%"'K":I"")'p#V;'#).'!"98#,'
& & &2&#)*"'.$"//'%A9A$O']'JA+):'''' ' ' '2&#)*"'"#$$+)*'/&#J"O']'JA+):'
' ' ' '2&#)*"'&#+$'J#$:O']'JA+):'
''''''''''''''''''SZp#V;O!"98#^'e'`'
&G&"'.+/:#)%"'K":I"")'@#$*"'#).'!"98#,'
& & &2&#)*"'.$"//'%A9A$O']'JA+):'' ' ''-..'"#$$+)*/O']'JA+):'
' ' ''S"%$"#/"'&"+*&:O']'JA+):'
' ' ''G#Q"'?J'/8AQ+)*O']'JA+):'
'''''''''''''''''''UA/"'I"+*&:O']'JA+):''''''
''''''''''''''''''SZ@#$*"O!"98#^'e'['
`5'
Slide adapted from CMU
Clustering Process
• The actual clustering algorithms
  – Hierarchical algorithm, which creates a hierarchical decomposition of the elements
  – Partitioning algorithm, which produces a single partitioning by optimizing some criterion
35
Set of Objects
Feature Representation
Similarity Measure
Element Grouping
Output Interpretation
Bottom-up Clustering
- Begin with each element in a separate cluster
- Merge clusters into successively larger clusters
- Repeat until one cluster is left
36
Recommended reading: Chapter 14 on Clustering from the book by Manning & Schütze
Top-down Clustering
- Begin with all elements in one whole cluster
- Divide clusters into successively smaller clusters
- Repeat until all elements are in singleton clusters
37
Recommended reading: Chapter 14 on Clustering from the book by Manning & Schütze
Cluster Proximity Estimate
• Single-Link
  – Nearest Neighbor: the closest members
• Complete-Link
  – Furthest Neighbor: the furthest members
• Centroid
  – Centers of gravity
38
(N#8J9"0'!+)*9"BU+)Q'@":&A.'
'' ' ' ' ''(?%9+."#)'S+/:#)%"'
`Y'
Distance Matrix
Partitioning Clustering
• Constructs a partition of n objects into a set of K clusters
• K-means algorithm:
  1. Select k clusters arbitrarily.
  2. Initialize cluster centers with those k clusters.
  3. Do loop
     a) Partition by assigning or reassigning all data objects to their closest cluster center.
     b) Compute new cluster centers as mean value of the objects in each cluster.
     Until no change in cluster center calculation
40
$$\mu_k = \frac{1}{c_k} \sum_{i \in C_k} x_i$$
where $c_k$ is the number of objects in cluster $C_k$.
Example
0      0
0      1
1      1
1      0
0.5    0.5
5      5
5      6
6      6
6      5
5.5    5.5
41
Task: Cluster the following objects into two clusters (k = 2).
Use the Manhattan distance:
$$d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$$
Initialize the clusters arbitrarily, here with the first two objects: C1 = {(0,0)}, C2 = {(0,1)}
Now:
2. Initialize cluster centers.
3a. Calculate the distance between each object and each cluster center, assigning the object to the closest cluster.
3b. Compute new cluster center for each cluster.
29?/:"$+)*'p$A%"//'
• (E#9?#DA)'AP':&"'J$A.?%".'%9?/:"$+)*'A?:J?:'
5L'
Set of Objects
Feature Representation
Similarity Measure
Element Grouping
Output Interpretation
29?/:"$+)*'(E#9?#DA)'
• 2A8J#$"':&"'%9?/:"$+)*'A?:J?:'I+:&'#'*A9.'
/:#).#$.'Z8#)?#99;'*")"$#:".'#)/I"$'Q";/^'
• (8K".':&"'%9?/:"$+)*'A?:J?:'+)'#)'#JJ9+%#DA)'
#).'?/+)*'+:/'"E#9?#DA)'8"#/?$"'
• (N#8J9"0'/"#$%&'")*+)"'$"/?9:/'
5`'
p?w)*'G&"A$;'+):A'p$#%D%"''ZK#%Q':A'A?$'d(S'(N#8J9"^'
p$AK9"8'CA$8?9#DA)'
• 4)J?:0'– d' 1"B1) (,8<<"1(' :&#:' 8")DA)' #' J#$D%?9#$' J$AJ"$'
)#8"'Z+:'%#)'K"'J"$/A)O'A$*#)+q#DA)'A$'9A%#DA)^'
• m?:J?:0'– R' %9?/:"$/O' I&"$"' "#%&' %9?/:"$' &#/' 1"B1) (,8<<"1(':&#:' #$"' /+8+9#$' :A' "#%&' A:&"$' #).' .+v"$"):' P$A8'
:&"'(,8<<"1()+)':&"'$"/:'AP':&"'%9?/:"$/''
5['
4)J?:''
• S$,'!"##$&CD&%&''('ZKA$)'L['\#)?#$;']Y5L^'+/'#'J$A8+)"):'$"/"#$%&"$'+)':&"'_"9./'AP'%A8J?:#DA)#9'9+)*?+/D%/O'.+/%A?$/"'#)#9;/+/O'#).'#$D_%+#9'
• !"##$)%&''(&+/':&"'$#*"B_99".O'.A8"/D%B#K?/+)*'%#$""$'%$+8+)#9'I&A'Q+99".'&+/'cB;"#$BA9.'.#?*&:"$'#).'&"$'YB;"#$BA9.'P$+").O'I+:&'/%#$%"9;'DDD&
• !"##$)%&''(O'-?:&A$,'-'_b&'*")"$#DA)'P#$8"$O'<#;)"'2$;:/'_)+/&".'&#$E"/D)*'&+/'%$AJ'+)':&"'P#99'AP']YcM'#).'&#?9".'8A$"':&#)'`LMMM'K?/&"9/'AP'/A;K"#)/'DDD&
• !"##$)%&''(O'I&A'+/'#%%?/".'AP'Q+99+)*'&+/'cB;"#$BA9.'.#?*&:"$'#).'&"$'K"/:'DDD'm)'<".)"/.#;O'#'X?.*"'.")+".'K#+9'PA$'!"##$)%&''(O'`5O'DDD&
• C?*+DE"/'x'!"##$)%&''(&B'T$+"P'B'C#:&"$'S")+".'T#+9'-I#+:/'G$+#9'CA$'2&+9.$")'/'@?$."$/'\"$$;'T$#):A)'oAKK/'#%%?/".'AP':&"'/:#KK+)*'."#:&/'DDD&
• !"##$&CD&%&''(,'-..$"//0'1!234!4'5676'-.8+$#9:;'<#;'DDD'!"##$&CD&%&''()*&''(>+/+,".?,'1!234!4O'5676'-.8+$#9:;'<#;O'@#$+)#'."9'=";O'2-'YMLYL'&
56'
m?:J?:'• !K83/*5&:%&
– S$,'!"##$&CD&%&''('ZKA$)'L['\#)?#$;']Y5L^'+/'#'J$A8+)"):'$"/"#$%&"$'+)':&"'_"9./'AP'%A8J?:#DA)#9'9+)*?+/D%/O'.+/%A?$/"'#)#9;/+/O'#).'#$D_%+#9'
– !"##$&CD&%&''(,'-..$"//0'1!234!4'5676'-.8+$#9:;'<#;'DDD'!"##$&CD&%&''()*&''(>+/+,".?,'1!234!4O'5676'-.8+$#9:;'<#;O'@#$+)#'."9'=";O'2-'YMLYL''
• !K83/*5&<%&– !"##$)%&''(&+/':&"'$#*"B_99".O'.A8"/D%B#K?/+)*'%#$""$'%$+8+)#9'I&A'Q+99".'&+/'
cB;"#$BA9.'.#?*&:"$'#).'&"$'YB;"#$BA9.'P$+").O'I+:&'/%#$%"9;'DDD&– !"##$)%&''(O'I&A'+/'#%%?/".'AP'Q+99+)*'&+/'cB;"#$BA9.'.#?*&:"$'#).'&"$'K"/:'DDD'
m)'<".)"/.#;O'#'X?.*"'.")+".'K#+9'PA$'!"##$)%&''(O'`5O'R&– C?*+DE"/'x'!"##$)%&''(&B'T$+"P'B'C#:&"$'S")+".'T#+9'-I#+:/'G$+#9'CA$'2&+9.$")'/'
@?$."$/'\"$$;'T$#):A)'oAKK/'#%%?/".'AP':&"'/:#KK+)*'."#:&/'DDD&
• !K83/*5&9%&– !"##$)%&''(O'-?:&A$,'-'_b&'*")"$#DA)'P#$8"$O'<#;)"'2$;:/'_)+/&".'
&#$E"/D)*'&+/'%$AJ'+)':&"'P#99'AP']YcM'#).'&#?9".'8A$"':&#)'`LMMM'K?/&"9/'AP'/A;K"#)/'DDDD&
57'
5c'
4K83/*5&3-2NN*/3&
!"##$)%&''('+/':&"'$#*"B_99".O'.A8"/D%B
#K?/+)*'%#$""$'%$+8+)#9'I&A'Q+99".'&+/''!"##$)%&''(O'I&A'+/'#%%?/".'AP'Q+99+)*'&+/'YB;"#$BA9.'.#?*&:"$'#).'&"$'K"/:'
S$,'!"##$'=,'%&''('+/'#'J$A8+)"):'$"/"#$%&"$'+)'
:&"'_"9./'AP'%A8J?:#DA)#9'9+)*?+/D%'
!"##$'=,'%&''(,'-..$"//0'1!234!4'5676'-.8+$#9:;'<#;'D'
!"##$)%&''(O'#'_b&'*")"$#DA)'P#$8"$O'
<#;)"'2$;:/'_)+/&".'&#$E"/D)*'&+/'%$AJ'
         s1    s2    …    sn
teach     2     0    …     7
kill     10     2    …     3
child     1     3    …     0
text snippet representation
snippet similarity
Collection of text snippets containing the name of interest
G"N:'!)+JJ":'="J$"/"):#DA)'
• G&"'%A):"N:'AP'"#%&'/)+JJ":'+/'$"J$"/"):".'K;'#'E"%:A$'I+:&'C'.+8")/+A)/'
• (#%&'.+8")/+A)'+).+%#:"/'I&":&"$'#'J#$D%?9#$'P"#:?$"'
A%%?$$".'+)':&"'%A):"N:'
– :&"'E#9?"'%#)'K"'K+)#$;O'P$"j?")%;'%A?):'":%,'
• G&"'P"#:?$"/'%#J:?$"':&"'%&#$#%:"$+/D%/'AP':&"'%A):"N:':A'K"'%9?/:"$".'
• 4):?+DE"9;O'E"%:A$/3%A):"N:/':&#:'/&#$"':&"'/#8"'
P"#:?$"/'I+99'K"'/+8+9#$':A'"#%&'A:&"$'
5Y'
2A):"N:/'Z+)J?:':"N:'/)+JJ":/^'
• 2):]0' S$,' !"##$& CD& %&''(' ZKA$)' L[' \#)?#$;' ]Y5L^' +/' #'J$A8+)"):' $"/"#$%&"$' +)' :&"' _"9./' AP' %A8J?:#DA)#9'9+)*?+/D%/O'.+/%A?$/"'#)#9;/+/O'#).'#$D_%+#9'
• 2):L0'!"##$)%&''(& +/' :&"'$#*"B_99".O'.A8"/D%B#K?/+)*'%#$""$'%$+8+)#9'I&A'Q+99".'&+/'cB;"#$BA9.'.#?*&:"$'#).'&"$'YB;"#$BA9.'P$+").O'I+:&'/%#$%"9;'DDD&
• 2):`0' !"##$)%&''(O'-?:&A$,'-'_b&'*")"$#DA)' P#$8"$O'<#;)"'2$;:/' _)+/&".' &#$E"/D)*' &+/' %$AJ' +)' :&"' P#99' AP' ]YcM' #).'&#?9".'8A$"':&#)'`LMMM'K?/&"9/'AP'/A;K"#)/'DDD&
• 2):50' !"##$) %&''(O' I&A' +/' #%%?/".' AP' Q+99+)*' &+/' YB;"#$BA9.'.#?*&:"$'#).'&"$'K"/:'DDD'm)'<".)"/.#;O'#'X?.*"'.")+".'K#+9'PA$'!"##$)%&''(O'`5O'DDD&
[M'
G"N:'!)+JJ":'C"#:?$"/'Z]^'
• 1)+*$#8'y'#'/+)*9"'IA$.':&#:'A%%?$/'8A$"'
:&#)'#'*+E")')?8K"$'AP'D8"/'
[]'
M2KK& (5.O42(K& 5*3*(54?*5& R& +(8F?/*5&
2):]0' M' ]' ]' M'
2):L0' ]' M' M' ]'
2):`0' M' M' M' M'
2):50' ]' M' M' ]'
K+)#$;'E#9?"/'
G"N:'!)+JJ":'C"#:?$"/'Z]^'
• 1)+*$#8'y'#'/+)*9"'IA$.':&#:'A%%?$/'8A$"'
:&#)'#'*+E")')?8K"$'AP'D8"/'
' ' ' ''
[L'
• 'Q+99' ' ']MMM'
• '#$D_%+#9' '[MM'
• '$"/"#$%&"$ 'LMM'
n'
• '.#?*&:"$ ']MM ''
P$"j?")%;'"/D8#:".'P$A8'%A$J?/'
M2KK& (5.O42(K& 5*3*(54?*5& R& +(8F?/*5&
2):]0' M' [MM' LMM' M'
2):L0' ]MMM' M' M' ]MM'
2):`0' M' M' M' M'
2):50' ]MMM' M' M' ]MM'
P$"j?")%;'E#9?"/'
G"N:'!)+JJ":'C"#:?$"/'ZL^'
• T+*$#8y'#)'A$."$".'J#+$'AP'IA$./':&#:'A%%?$'
:A*":&"$'8A$"'Ab")':&#)'"NJ"%:".'K;'%&#)%"'
[`'
M2KK&?23& N56)2-*-/&5*3*(54?*5& 452)2-(K&E?6& R& ZU0*(5U6K+&+(8F?/*5&
2):]0' M' ]' M' M'
2):L0' ]' M' ]' ]'
2):`0' M' M' M' M'
2):50' ]' M' M' ]'
K+)#$;'E#9?"/'
G"N:'!)+JJ":'C"#:?$"/'ZL^'• T+*$#8y'#)'A$."$".'J#+$'AP'IA$./':&#:'A%%?$'
:A*":&"$'8A$"'Ab")':&#)'"NJ"%:".'K;'%&#)%"'
[5'
M2KK&?23& N56)2-*-/&5*3*(54?*5& 452)2-(K&E?6& R& ZU0*(5U6K+&+(8F?/*5&
2):]0' M' ]ML,Y' M' M'
2):L0' L],L' M' 6c,[' `[,Y'
2):`0' M' M' M' M'
2):50' L],L' M' M' `[,Y'
P$"j?")%;'I"+*&:/'
• 'Q+99'&+/ ' ' ' ''''L],L'
• 'J$A8+)"):'$"/"#$%&"$ '''']ML,Y'
• '%$+8+)#9'I&A' ' ''''6c,['
n'
• 'cB;"#$BA9.'.#?*&:"$ '''`[,Y'
$-\log P(w_1 \mid w_0)$, log-likelihood scores based on frequency estimated from corpus
G"N:'!)+JJ":'W$A?J+)*'
• group text snippets by similar meaning
• snippet similarity is calculated as
55
         kill   artificial   researcher   daughter
Cnt1:     0         1            1           0
Cnt2:     1         0            0           1
Cnt3:     0         0            0           0
Cnt4:     1         0            0           1
$$\mathrm{sim}(Cnt_1, Cnt_2) = \sum_{i=1}^{n} w_{1i} \cdot w_{2i}$$
sim(Cnt1,Cnt2) = (0*1)+(1*0)+(1*0)+(0*1) = 0
sim(Cnt1,Cnt3) = (0*0)+(1*0)+(1*0)+(0*0) = 0
sim(Cnt1,Cnt4) = (0*1)+(1*0)+(1*0)+(0*1) = 0
sim(Cnt2,Cnt3) = (1*0)+(0*0)+(0*0)+(1*0) = 0
sim(Cnt2,Cnt4) = (1*1)+(0*0)+(0*0)+(1*1) = 2
sim(Cnt3,Cnt4) = (0*1)+(0*0)+(0*0)+(0*1) = 0
C+)#9'm?:J?:'
• (#%&'%9?/:"$'%A)/+/:/'AP'#'%"$:#+)')?8K"$'AP'1"B1)(,8<<"1(D)8E"E'/8#99':"N:'P$#*8"):/,'
• G&"'%9?/:"$/'%A$$"/JA).':A':&"'.+v"$"):'J"AJ9"'/&#$+)*':&"'/#8"')#8"'
– 29?/:"$]0'\"$$;'oAKK/':&"'#"("3#4*"#)– 29?/:"$L0'\"$$;'oAKK/':&"'C866"#)– 29?/:"$`0'\"$$;'oAKK/':&"'(8,>"#)
[6'
<"K'p"AJ9"'!"#$%&'2c")*"'
• G&"'_$/:'%c")*"'I#/'A$*#)+q".'+)'LMM7'
• <"p!'PA%?/"/'A)'J"$/A)'#).'A$*#)+q#DA)')#8"'
.+/#8K+*?#DA)'AP'<"K'J#*"/'
• CA$'"#%&'#8K+*?A?/')#8"O':&"'/;/:"8'8?/:'$":?$)':&"'
.A%?8"):/'#).':&"'#V$+K?:"/'I&+%&'#$"'$"9"E#):'PA$':&"'
.+v"$"):'/")/"/'AP':&"')#8"'
• U#/:'/?%&'%c")*"'I#/'A)']/:'AP'\?9;'LM]M'
• @A$"'+)PA$8#DA)'#:0''&VJ033)9J,?)".,"/3I"J/3'
[7'
d#8"'S+/%$+8+)#DA)'S"8A'
• !")/"29?/:"$/'K;'G".'p"."$/")'&VJ0338#$+8K#,.,?8),".?3%*+BK+)3!2B%*+3+)."N,%*+'
• G&"'/AbI#$"'%#)'K"'?/".'PA$0'– J$AJ"$')#8"'.+/%$+8+)#DA)'
– IA$.'/")/"'.+/%$+8+)#DA)'
– "B8#+9'%9?/:"$+)*'
– /;)A);8'_).+)*'
[c'
<&#:'IA?9.';A?'.A'I+:&'%9?/:"$+)*a'
• (B8#+9'%9?/:"$+)*'K;':AJ+%'
• m$*#)+q"'.A%?8"):/'+):A'8?9DJ9"'%#:"*A$+"/'
9+Q"'WAA*9"')"I/'
• G$#%"'I&#:':IA'J"AJ9"':#9Q'A)'GI+V"$'• !")D8"):'#)#9;/+/''
• n'
[Y'