<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
  <controlfield tag="001">162445</controlfield>
  <controlfield tag="005">20251017144634.0</controlfield>
  <datafield tag="024" ind1="7" ind2=" ">
    <subfield code="2">doi</subfield>
    <subfield code="a">10.1109/ICASSP39728.2021.9414859</subfield>
  </datafield>
  <datafield tag="024" ind1="8" ind2=" ">
    <subfield code="2">sideral</subfield>
    <subfield code="a">129899</subfield>
  </datafield>
  <datafield tag="037" ind1=" " ind2=" ">
    <subfield code="a">ART-2021-129899</subfield>
  </datafield>
  <datafield tag="041" ind1=" " ind2=" ">
    <subfield code="a">eng</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Mingote, Victoria</subfield>
    <subfield code="u">Universidad de Zaragoza</subfield>
    <subfield code="0">(orcid)0000-0002-3505-0249</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">Memory Layers with Multi-Head Attention Mechanisms for Text-Dependent Speaker Verification</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2021</subfield>
  </datafield>
  <datafield tag="506" ind1="0" ind2=" ">
    <subfield code="a">Access copy available to the general public</subfield>
    <subfield code="f">Unrestricted</subfield>
  </datafield>
  <datafield tag="520" ind1="3" ind2=" ">
    <subfield code="a">In this paper, we explore an approach based on memory layers and multi-head attention mechanisms to improve in an efficient way the performance of text-dependent speaker verification (SV) systems. The most extended SV systems based on Deep Neural Networks (DNN) extract the embedding of the utterance from the average pooling of the temporal dimension after processing. Unlike previous works, we can exploit the phonetic knowledge needed for text-dependent SV systems by combining the temporal attention of multiple parallel heads with the phonetic embeddings extracted from a phonetic classification network, which helps to guide to the attention mechanism with the role of the positional embedding. The addition of a memory layer to a text-dependent SV system was tested on the RSR2015-part II and DeepMine-part I databases, where, in both cases outperformed the baseline result and the reference system based on the same transformer network without the memory layer.</subfield>
  </datafield>
  <datafield tag="536" ind1=" " ind2=" ">
    <subfield code="9">info:eu-repo/grantAgreement/ES/DGA/T36-20R</subfield>
    <subfield code="9">info:eu-repo/grantAgreement/ES/MINECO/TIN2017-85854-C4-1-R</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
    <subfield code="9">info:eu-repo/semantics/openAccess</subfield>
    <subfield code="a">All rights reserved</subfield>
    <subfield code="u">http://www.europeana.eu/rights/rr-f/</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="4">
    <subfield code="a">info:eu-repo/semantics/article</subfield>
    <subfield code="v">info:eu-repo/semantics/acceptedVersion</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Miguel, Antonio</subfield>
    <subfield code="u">Universidad de Zaragoza</subfield>
    <subfield code="0">(orcid)0000-0001-5803-4316</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Ortega, Alfonso</subfield>
    <subfield code="u">Universidad de Zaragoza</subfield>
    <subfield code="0">(orcid)0000-0002-3886-7748</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Lleida, Eduardo</subfield>
    <subfield code="u">Universidad de Zaragoza</subfield>
    <subfield code="0">(orcid)0000-0001-9137-4013</subfield>
  </datafield>
  <datafield tag="710" ind1="2" ind2=" ">
    <subfield code="1">5008</subfield>
    <subfield code="2">800</subfield>
    <subfield code="a">Universidad de Zaragoza</subfield>
    <subfield code="b">Dpto. Ingeniería Electrón.Com.</subfield>
    <subfield code="c">Área Teoría Señal y Comunicac.</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="g">2021 (2021), 6154-6158</subfield>
    <subfield code="t">Proceedings - ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing</subfield>
    <subfield code="x">0736-7791</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">241617</subfield>
    <subfield code="u">http://zaguan.unizar.es/record/162445/files/texto_completo.pdf</subfield>
    <subfield code="y">Postprint</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">2968788</subfield>
    <subfield code="u">http://zaguan.unizar.es/record/162445/files/texto_completo.jpg?subformat=icon</subfield>
    <subfield code="x">icon</subfield>
    <subfield code="y">Postprint</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="o">oai:zaguan.unizar.es:162445</subfield>
    <subfield code="p">articulos</subfield>
    <subfield code="p">driver</subfield>
  </datafield>
  <datafield tag="951" ind1=" " ind2=" ">
    <subfield code="a">2025-10-17-14:28:13</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">ARTICLE</subfield>
  </datafield>
</record>
</collection>