neobio-0.0.20030929/ 0002755 0002656 0002032 00000000000 11662262520 012663 5 ustar tillea admin neobio-0.0.20030929/META-INF/ 0002755 0002656 0002032 00000000000 07727747074 014046 5 ustar tillea admin neobio-0.0.20030929/META-INF/MANIFEST.MF 0000644 0002656 0002032 00000000107 07727747074 015474 0 ustar tillea admin Manifest-Version: 1.0 Created-By: 1.4.0_01 (Sun Microsystems Inc.) neobio-0.0.20030929/src/ 0002755 0002656 0002032 00000000000 11662262520 013452 5 ustar tillea admin neobio-0.0.20030929/src/neobio/ 0002755 0002656 0002032 00000000000 11662262520 014725 5 ustar tillea admin neobio-0.0.20030929/src/neobio/gui/ 0002755 0002656 0002032 00000000000 11662262520 015511 5 ustar tillea admin neobio-0.0.20030929/src/neobio/gui/NeoBio.java 0000644 0002656 0002032 00000014561 07727746054 017554 0 ustar tillea admin /* * NeoBio.java * * Copyright 2003 Sergio Anibal de Carvalho Junior * * This file is part of NeoBio. * * NeoBio is free software; you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with NeoBio; * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307, USA. * * Proper attribution of the author as the source of the software would be appreciated. * * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net * Department of Computer Science http://www.dcs.kcl.ac.uk * King's College London, UK http://www.kcl.ac.uk * * Please visit http://neobio.sourceforge.net * * This project was supervised by Professor Maxime Crochemore. 
* */ package neobio.gui; import java.io.*; import java.awt.*; import java.awt.event.*; import javax.swing.*; import javax.swing.border.*; import java.applet.*; import java.util.*; import java.net.URL; import java.beans.PropertyVetoException; /** * This class is a simple GUI utility for computing pairwise sequence alignments using one * of the the algorithms provided in the {@link neobio.alignment} package. * * @author Sergio A. de Carvalho Jr. */ public class NeoBio extends JFrame { private JMenu file_menu, help_menu; private JMenuBar menu_bar; private JMenuItem new_alignment_item, exit_menuitem, about_menuitem; private JSeparator mid_separator; private JToolBar file_toolbar; private JPanel toolbar_panel; private JButton alignment_button; private JDesktopPane desktop_pane; /** * Creates a new instance of a graphical interface. */ public NeoBio() { super(); setTitle("NeoBio"); setDefaultCloseOperation (WindowConstants.DISPOSE_ON_CLOSE); initComponents(); show(); // always open pairwise alignment internal frame pairwiseAlignment(); } private void initComponents() { URL icon; // window closing event addWindowListener(new WindowAdapter() { public void windowClosing(WindowEvent e) { exitForm(); } }); Container content_pane = getContentPane(); desktop_pane = new JDesktopPane(); content_pane.add (desktop_pane, BorderLayout.CENTER); new_alignment_item = new JMenuItem("Pairwise Alignment"); new_alignment_item.setMnemonic('p'); new_alignment_item.addActionListener(new ActionListener() { public void actionPerformed(ActionEvent e) { newAlignmentActionPerformed(e); } }); icon = getClass().getResource("icons/alignment.gif"); if (icon != null) new_alignment_item.setIcon(new ImageIcon(icon)); mid_separator = new JSeparator(); exit_menuitem = new JMenuItem("Exit"); exit_menuitem.setMnemonic('x'); exit_menuitem.addActionListener(new ActionListener() { public void actionPerformed(ActionEvent e) { exitMenuItemActionPerformed(e); } }); file_menu = new JMenu("File"); 
file_menu.setMnemonic('f'); file_menu.add(new_alignment_item); file_menu.add(mid_separator); file_menu.add(exit_menuitem); about_menuitem = new JMenuItem("About"); about_menuitem.addActionListener(new ActionListener() { public void actionPerformed(ActionEvent e) { aboutMenuItemActionPerformed(e); } }); icon = getClass().getResource("icons/help.gif"); if (icon != null) about_menuitem.setIcon(new ImageIcon(icon)); help_menu = new JMenu("Help"); help_menu.add(about_menuitem); menu_bar = new JMenuBar(); //menu_bar.setFont(getFont()); menu_bar.add(file_menu); menu_bar.add(help_menu); setJMenuBar(menu_bar); alignment_button = new JButton(); alignment_button.setMnemonic('p'); alignment_button.setToolTipText("Pairwise Alignment..."); alignment_button.addActionListener(new ActionListener() { public void actionPerformed(ActionEvent e) { newAlignmentActionPerformed(e); } }); icon = getClass().getResource("icons/alignment.gif"); if (icon != null) alignment_button.setIcon(new ImageIcon(icon)); file_toolbar = new JToolBar(); file_toolbar.setRollover(true); file_toolbar.add(alignment_button); toolbar_panel = new JPanel(); toolbar_panel.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0)); toolbar_panel.setBorder(new EtchedBorder()); toolbar_panel.add(file_toolbar); content_pane.add(toolbar_panel, BorderLayout.NORTH); // set frame size Dimension screen = Toolkit.getDefaultToolkit().getScreenSize(); setSize((screen.width * 2) / 3, (screen.height * 7) / 8); setLocation(screen.width / 6, screen.height / 16); } private void aboutMenuItemActionPerformed (ActionEvent e) { (new AboutDialog(this)).show(); } private void exitMenuItemActionPerformed (ActionEvent e) { exitForm(); } private void exitForm () { System.exit(0); } private void newAlignmentActionPerformed (ActionEvent e) { pairwiseAlignment(); } private void pairwiseAlignment () { PairwiseAlignmentFrame alignment_frame = new PairwiseAlignmentFrame (this); desktop_pane.add (alignment_frame); alignment_frame.setBounds(0, 0, 500, 500); 
alignment_frame.show(); alignment_frame.toFront(); try { alignment_frame.setMaximum (true); } catch (PropertyVetoException e) {} } /** * Create and run a new interface. The main method takes no parameter from the * command line. * * @param args command line arguments */ public static void main(String args[]) { new NeoBio(); } } neobio-0.0.20030929/src/neobio/gui/AboutDialog.java 0000644 0002656 0002032 00000004721 07727746344 020572 0 ustar tillea admin /* * About.java * * Copyright 2003 Sergio Anibal de Carvalho Junior * * This file is part of NeoBio. * * NeoBio is free software; you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with NeoBio; * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307, USA. * * Proper attribution of the author as the source of the software would be appreciated. * * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net * Department of Computer Science http://www.dcs.kcl.ac.uk * King's College London, UK http://www.kcl.ac.uk * * Please visit http://neobio.sourceforge.net * * This project was supervised by Professor Maxime Crochemore. * */ package neobio.gui; import java.awt.*; import java.awt.event.*; import javax.swing.*; import java.net.URL; /** * About screen. * * @author Sergio A. de Carvalho Jr. */ public class AboutDialog extends JDialog { private JLabel image_label; /** * Creates a new instance of the About screen. 
* * @param parent the parent frame */ public AboutDialog (Frame parent) { super (parent, true); initComponents (); pack (); } private void initComponents () { URL image_filename; setTitle ("About"); setResizable (false); setDefaultCloseOperation (WindowConstants.DISPOSE_ON_CLOSE); addWindowListener (new WindowAdapter () { public void windowClosing (WindowEvent e) { closeDialog (e); } }); image_filename = getClass().getResource("icons/about.jpg"); if (image_filename != null) { image_label = new JLabel (); image_label.setIcon(new ImageIcon(image_filename)); getContentPane().add(image_label, BorderLayout.CENTER); } } private void closeDialog(WindowEvent e) { setVisible (false); dispose (); } } neobio-0.0.20030929/src/neobio/gui/PairwiseAlignmentFrame.java 0000644 0002656 0002032 00000046605 07727746352 023003 0 ustar tillea admin /* * PairwiseAlignmentFrame.java * * Copyright 2003 Sergio Anibal de Carvalho Junior * * This file is part of NeoBio. * * NeoBio is free software; you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with NeoBio; * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307, USA. * * Proper attribution of the author as the source of the software would be appreciated. 
* * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net * Department of Computer Science http://www.dcs.kcl.ac.uk * King's College London, UK http://www.kcl.ac.uk * * Please visit http://neobio.sourceforge.net * * This project was supervised by Professor Maxime Crochemore. * */ package neobio.gui; import neobio.alignment.*; import java.io.*; import java.awt.*; import java.awt.event.*; import javax.swing.*; import javax.swing.border.*; import javax.swing.event.*; /** * This class is the internal frame of NeoBio's graphical interface for computing pairwise * sequence alignments using one of the the algorithms provided in the {@link * neobio.alignment} package. * * @author Sergio A. de Carvalho Jr. */ public class PairwiseAlignmentFrame extends JInternalFrame { private static int window_number = 1; private Frame parent_frame; private JPanel input_panel, scoring_panel, algorithm_panel, output_panel; private JPanel progress_tab_panel, output_tab_panel; private JTextField seq1_field, seq2_field, matrix_field, output_field; private JTextField match_field, mismatch_field, gap_field; private JTextArea progress_area, output_area; private JButton find_seq1_button, find_seq2_button, find_output_button; private JButton find_matrix_button, run_button; private JComboBox algorithm_combo; private JTabbedPane output_tab; private JRadioButton screen_button, file_button, basic_button, matrix_button; private ButtonGroup scoring_group, output_group; private JLabel seq1_label, seq2_label; private JLabel match_label, mismatch_label, gap_label; private JFileChooser find_dialog; private boolean output_to_file, basic_scheme; private String[] algorithm_name = {"Needleman & Wunsch (global alignment)", "Smith & Waterman (local alignment)", "Crochemore, Landau & Ziv-Ukelson for global alignment", "Crochemore, Landau & Ziv-Ukelson for local alignment"}; private PairwiseAlignmentAlgorithm[] algorithm = {new NeedlemanWunsch(), new SmithWaterman(), new 
CrochemoreLandauZivUkelsonGlobalAlignment(), new CrochemoreLandauZivUkelsonLocalAlignment()}; /** * Creates a new instance of the internal frame. * * @param parent_frame the parent frame */ public PairwiseAlignmentFrame (Frame parent_frame) { this.parent_frame = parent_frame; initComponents(); } private void initComponents() { JComponent pane; GridBagConstraints c; setIconifiable(true); setMaximizable(true); setResizable(true); setClosable(true); setTitle("Pairwise Sequence Alignment " + window_number++); setMinimumSize(new Dimension(500, 500)); pane = (JComponent) getContentPane(); pane.setLayout(new GridBagLayout()); c = new GridBagConstraints(); c.insets = new Insets (4, 4, 4, 4); c.fill = GridBagConstraints.BOTH; c.weightx = 1.0; c.weighty = 0; // input panel input_panel = new JPanel (); add (pane, input_panel, c, 0, 0); // scoring panel scoring_panel = new JPanel (); add (pane, scoring_panel, c, 0, 1); // output panel output_panel = new JPanel (); add (pane, output_panel, c, 0, 2); // algorithm panel algorithm_panel = new JPanel (); add (pane, algorithm_panel, c, 0, 3); c.weightx = 1.0; c.weighty = 1.0; // output tab output_tab = new JTabbedPane(); add (pane, output_tab, c, 0, 4); find_dialog = new JFileChooser(); find_dialog.setDialogTitle("Find..."); find_dialog.setDialogType(JFileChooser.OPEN_DIALOG ); // ***************** INPUT PANEL ***************** input_panel.setLayout(new GridBagLayout()); input_panel.setBorder(BorderFactory.createTitledBorder(new EtchedBorder( EtchedBorder.LOWERED), "Input")); seq1_label = new JLabel("Sequence 1:"); seq2_label = new JLabel("Sequence 2:"); seq1_field = new JTextField(); seq1_field.addCaretListener (new CaretListener() { public void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); seq2_field = new JTextField(); seq2_field.addCaretListener (new CaretListener() { public void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); find_seq1_button = new JButton("Find..."); find_seq1_button.addActionListener 
(new ActionListener() { public void actionPerformed (ActionEvent e) { findSeq1ButtonActionPerformed(); } }); find_seq2_button = new JButton("Find..."); find_seq2_button.addActionListener (new ActionListener() { public void actionPerformed (ActionEvent e) { findSeq2ButtonActionPerformed(); } }); c.weightx = 0; c.weighty = 0; c.anchor = GridBagConstraints.EAST; add (input_panel, seq1_label, c, 0, 0); add (input_panel, seq2_label, c, 0, 1); c.anchor = GridBagConstraints.CENTER; add (input_panel, find_seq1_button, c, 2, 0); add (input_panel, find_seq2_button, c, 2, 1); c.weightx = 1.0; c.fill = GridBagConstraints.HORIZONTAL; add (input_panel, seq1_field, c, 1, 0); add (input_panel, seq2_field, c, 1, 1); // ***************** SCORING SCHEME PANEL ***************** scoring_panel.setLayout(new GridBagLayout()); scoring_panel.setBorder(BorderFactory.createTitledBorder(new EtchedBorder( EtchedBorder.LOWERED), "Scoring Scheme")); basic_scheme = true; basic_button = new JRadioButton("Basic:"); basic_button.setSelected(true); basic_button.addItemListener (new ItemListener() { public void itemStateChanged (ItemEvent e) { schemeOptionStateChanged(); } }); matrix_button = new JRadioButton("Substitution Matrix:"); matrix_button.addItemListener (new ItemListener() { public void itemStateChanged (ItemEvent e) { schemeOptionStateChanged(); } }); match_label = new JLabel("Match:"); mismatch_label = new JLabel ("Mismatch:"); gap_label = new JLabel ("Gap:"); match_field = new JTextField("1", 2); match_field.setHorizontalAlignment(JTextField.RIGHT); match_field.addCaretListener (new CaretListener() { public void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); mismatch_field = new JTextField("-1", 2); mismatch_field.setHorizontalAlignment(JTextField.RIGHT); mismatch_field.addCaretListener (new CaretListener() { public void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); gap_field = new JTextField("-1", 2); gap_field.setHorizontalAlignment(JTextField.RIGHT); 
gap_field.addCaretListener (new CaretListener() { public void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); matrix_field = new JTextField(); matrix_field.setEnabled(false); matrix_field.addCaretListener (new CaretListener() { public void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); find_matrix_button = new JButton("Find..."); find_matrix_button.setEnabled(false); find_matrix_button.addActionListener (new ActionListener() { public void actionPerformed (ActionEvent e) { findMatrixButtonActionPerformed(); } }); scoring_group = new ButtonGroup (); scoring_group.add(basic_button); scoring_group.add(matrix_button); c.weightx = 0; c.fill = GridBagConstraints.NONE; c.anchor = GridBagConstraints.WEST; add (scoring_panel, basic_button, c, 0, 0); c.anchor = GridBagConstraints.EAST; add (scoring_panel, match_label, c, 1, 0); add (scoring_panel, mismatch_label, c, 3, 0); add (scoring_panel, gap_label, c, 5, 0); c.anchor = GridBagConstraints.WEST; add (scoring_panel, matrix_button, c, 0, 1); c.anchor = GridBagConstraints.CENTER; add (scoring_panel, find_matrix_button, c, 7, 1); c.weightx = 1.0 / 3; c.fill = GridBagConstraints.HORIZONTAL; add (scoring_panel, match_field, c, 2, 0); add (scoring_panel, mismatch_field, c, 4, 0); add (scoring_panel, gap_field, c, 6, 0); c.weightx = 1.0; c.gridwidth = 6; add (scoring_panel, matrix_field, c, 1, 1); c.gridwidth = 1; // ***************** OUTPUT PANEL ***************** output_panel.setLayout(new GridBagLayout()); output_panel.setBorder(BorderFactory.createTitledBorder(new EtchedBorder( EtchedBorder.LOWERED), "Output")); screen_button = new JRadioButton("Screen"); screen_button.setSelected(true); output_to_file = false; file_button = new JRadioButton("File:"); file_button.addItemListener (new ItemListener() { public void itemStateChanged (ItemEvent e) { outputOptionStateChanged(); } }); output_field = new JTextField(); output_field.setEnabled(false); output_field.addCaretListener (new CaretListener() { public 
void caretUpdate (CaretEvent e) { checkRunButtonStatus (); } }); find_output_button = new JButton("Find..."); find_output_button.setEnabled(false); find_output_button.addActionListener (new ActionListener() { public void actionPerformed (ActionEvent e) { findOutputButtonActionPerformed(); } }); output_group = new ButtonGroup (); output_group.add(screen_button); output_group.add(file_button); c.weightx = 0; c.weighty = 0; c.fill = GridBagConstraints.NONE; add (output_panel, screen_button, c, 0, 0); add (output_panel, file_button, c, 1, 0); add (output_panel, find_output_button, c, 3, 0); c.weightx = 1.0; c.weighty = 0; c.fill = GridBagConstraints.HORIZONTAL; add (output_panel, output_field, c, 2, 0); // ***************** ALGORITHM PANEL ***************** algorithm_panel.setLayout(new GridBagLayout()); algorithm_panel.setBorder(BorderFactory.createTitledBorder(new EtchedBorder( EtchedBorder.LOWERED), "Alignment Algorithm")); algorithm_combo = new JComboBox(algorithm_name); run_button = new JButton("Run"); run_button.setEnabled(false); run_button.addActionListener (new ActionListener() { public void actionPerformed (ActionEvent e) { runButtonActionPerformed(); } }); c.weightx = 1.0; c.weighty = 0; c.fill = GridBagConstraints.HORIZONTAL; add (algorithm_panel, algorithm_combo, c, 0, 0); c.weightx = 0; c.weighty = 0; c.fill = GridBagConstraints.NONE; add (algorithm_panel, run_button, c, 1, 0); // ***************** OUTPUT TAB ***************** progress_area = new JTextArea (); progress_area.setEditable (false); progress_area.setBorder (BorderFactory.createBevelBorder(BevelBorder.LOWERED)); progress_tab_panel = new JPanel (); progress_tab_panel.setLayout (new GridLayout()); progress_tab_panel.add (new JScrollPane (progress_area)); output_tab.addTab ("Progress", progress_tab_panel); output_area = new JTextArea (); output_area.setEditable (false); output_area.setBorder (BorderFactory.createBevelBorder(BevelBorder.LOWERED)); output_area.setFont (new Font("Monospaced", 
Font.PLAIN, 12)); output_tab_panel = new JPanel(); output_tab_panel.setLayout (new GridLayout()); output_tab_panel.add (new JScrollPane (output_area)); output_tab.addTab ("Output", output_tab_panel); } private void add (JComponent a, JComponent b, GridBagConstraints c, int x, int y) { c.gridx = x; c.gridy = y; a.add (b, c); } private void findSeq1ButtonActionPerformed () { int c = find_dialog.showOpenDialog (this); if (c != JFileChooser.APPROVE_OPTION) return; seq1_field.setText (find_dialog.getSelectedFile().getPath()); } private void findSeq2ButtonActionPerformed () { int c = find_dialog.showOpenDialog (this); if (c != JFileChooser.APPROVE_OPTION) return; seq2_field.setText (find_dialog.getSelectedFile().getPath()); } private void findMatrixButtonActionPerformed () { int c = find_dialog.showOpenDialog (this); if (c != JFileChooser.APPROVE_OPTION) return; matrix_field.setText (find_dialog.getSelectedFile().getPath()); } private void findOutputButtonActionPerformed () { int c = find_dialog.showOpenDialog (this); if (c != JFileChooser.APPROVE_OPTION) return; output_field.setText (find_dialog.getSelectedFile().getPath()); } private void schemeOptionStateChanged () { basic_scheme = basic_button.isSelected(); match_label.setEnabled(basic_scheme); match_field.setEnabled(basic_scheme); mismatch_label.setEnabled(basic_scheme); mismatch_field.setEnabled(basic_scheme); gap_label.setEnabled(basic_scheme); gap_field.setEnabled(basic_scheme); matrix_field.setEnabled (!basic_scheme); find_matrix_button.setEnabled (!basic_scheme); checkRunButtonStatus(); } private void outputOptionStateChanged () { output_to_file = file_button.isSelected(); output_field.setEnabled (output_to_file); find_output_button.setEnabled (output_to_file); checkRunButtonStatus(); } private void checkRunButtonStatus () { boolean run = true; if (seq1_field.getText().length() == 0 || seq2_field.getText().length() == 0) { run = false; } else { if (file_button.isSelected() && output_field.getText().length() == 
0) { run = false; } else { if (matrix_button.isSelected()) { if (matrix_field.getText().length() == 0) { run = false; } } else { if (match_field.getText().length() == 0 || mismatch_field.getText().length() == 0 || gap_field.getText().length() == 0) { run = false; } } } } if ((run_button.isEnabled() && !run) || (!run_button.isEnabled() && run)) run_button.setEnabled(run); } private void runButtonActionPerformed () { ScoringScheme scoring; PairwiseAlignment alignment; FileReader seq1_file, seq2_file, matrix_file; BufferedWriter output_file; String seq1_filename, seq2_filename; String matrix_filename, output_filename, message; int alg, match, mismatch, gap; long start, elapsed; alg = algorithm_combo.getSelectedIndex(); output_tab.setSelectedIndex(0); output_area.setText (""); // ***************** SET SCORING SCHEME ***************** if (basic_scheme) { progress_area.setText ("Creating scoring scheme... "); try { match = Integer.parseInt (match_field.getText()); mismatch = Integer.parseInt (mismatch_field.getText()); gap = Integer.parseInt (gap_field.getText()); scoring = new BasicScoringScheme (match, mismatch, gap); algorithm[alg].setScoringScheme(scoring); progress_area.append ("OK"); } catch (NumberFormatException e) { message = "Invalid scoring arguments."; progress_area.append ("\n" + message); showError (message); return; } } else { matrix_filename = matrix_field.getText (); progress_area.setText ("Loading matrix file... 
"); try { matrix_file = new FileReader (matrix_filename); } catch (FileNotFoundException e) { message = "File \"" + matrix_filename + "\" not found."; progress_area.append("\n" + message); showError (message); return; } try { try { scoring = new ScoringMatrix (matrix_file); algorithm[alg].setScoringScheme(scoring); progress_area.append ("OK"); } catch (InvalidScoringMatrixException e) { matrix_file.close(); message = "Invalid matrix file \"" + matrix_filename + "\"."; progress_area.append ("\n" + message); showError (message); return; } matrix_file.close(); } catch (IOException e) { message = "Error reading file."; progress_area.append("\n" + message); showError (message); return; } } // ***************** LOAD SEQUENCES ***************** progress_area.append ("\n\nLoading sequences... "); seq1_filename = seq1_field.getText (); try { seq1_file = new FileReader (seq1_filename); } catch (FileNotFoundException e) { message = "File \"" + seq1_filename +"\" not found."; progress_area.append("\n" + message); showError (message); return; } seq2_filename = seq2_field.getText (); try { seq2_file = new FileReader (seq2_filename); } catch (FileNotFoundException e) { message = "File \"" + seq2_filename +"\" not found."; progress_area.append("\n" + message); showError (message); return; } try { try { start = System.currentTimeMillis(); algorithm[alg].loadSequences (seq1_file, seq2_file); elapsed = System.currentTimeMillis() - start; progress_area.append ("OK"); progress_area.append ("\n[ Elapsed time: " + elapsed + " milliseconds ]"); } catch (InvalidSequenceException e) { seq1_file.close(); seq2_file.close(); message = "Invalid sequence files."; progress_area.append ("\n" + message); showError (message); return; } seq1_file.close(); seq2_file.close(); } catch (IOException e) { message = "Error reading sequence files."; progress_area.append("\n" + message); showError (message); return; } // ***************** EXECUTE ALGORITHM ***************** progress_area.append("\n\nRunning " 
+ algorithm_combo.getSelectedItem() + "... "); try { start = System.currentTimeMillis(); alignment = algorithm[alg].getPairwiseAlignment(); elapsed = System.currentTimeMillis() - start; progress_area.append ("OK"); progress_area.append ("\n[ Elapsed time: " + elapsed + " milliseconds ]"); } catch (IncompatibleScoringSchemeException e) { message = "Scoring matrix is not compatible with loaded sequences."; progress_area.append ("\n" + message); showError (message); return; } catch (OutOfMemoryError e) { message = "Insufficient memory to compute an alignment"; progress_area.append ("\n" + message); showError (message); return; } // ***************** DISPLAY / SAVE OUTPUT ***************** if (output_to_file) { output_filename = output_field.getText (); progress_area.append ("\n\nSaving alignment... "); try { int length = alignment.getGappedSequence1().length(); output_file = new BufferedWriter(new FileWriter (output_filename)); output_file.write(alignment.getGappedSequence1(), 0, length); output_file.newLine(); output_file.write(alignment.getScoreTagLine(), 0, length); output_file.newLine(); output_file.write(alignment.getGappedSequence2(), 0, length); output_file.newLine(); String tmp = "Score: " + alignment.getScore(); output_file.write(tmp, 0, tmp.length()); output_file.close(); } catch (IOException e) { message = "Error writing file \"" + output_filename +"\"."; progress_area.append("\n" + message); showError (message); return; } progress_area.append ("OK"); } else { output_area.setText (alignment.toString()); output_tab.setSelectedIndex(1); } } private void showError (String message) { JOptionPane.showMessageDialog(this, message, "Error", JOptionPane.ERROR_MESSAGE); } } neobio-0.0.20030929/src/neobio/textui/ 0002755 0002656 0002032 00000000000 11662262520 016247 5 ustar tillea admin neobio-0.0.20030929/src/neobio/textui/NeoBio.java 0000644 0002656 0002032 00000017546 07727250206 020305 0 ustar tillea admin /* * NeoBio.java * * Copyright 2003 Sergio Anibal de 
Carvalho Junior * * This file is part of NeoBio. * * NeoBio is free software; you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with NeoBio; * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307, USA. * * Proper attribution of the author as the source of the software would be appreciated. * * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net * Department of Computer Science http://www.dcs.kcl.ac.uk * King's College London, UK http://www.kcl.ac.uk * * Please visit http://neobio.sourceforge.net * * This project was supervised by Professor Maxime Crochemore. * */ package neobio.textui; import neobio.alignment.*; import java.io.FileReader; import java.io.IOException; /** * This class is a simple command line based utility for computing pairwise sequence * alignments using one of the the algorithms provided in the {@link neobio.alignment} * package. * *
The main method takes the following parameters from the command line:
*
*
*
*
* NeoBio <alg> <S1> <S2> [M <matrix> | S <match>
* <mismatch> <gap>]
*
<alg>
is either NW
for {@linkplain
* neobio.alignment.NeedlemanWunsch Needleman & Wunsch} (global alignment),
* SW
for {@linkplain neobio.alignment.SmithWaterman Smith & Waterman}
* (local alignment), CLZG
for {@linkplain
* neobio.alignment.CrochemoreLandauZivUkelsonGlobalAlignment Crochemore, Landau &
* Ziv-Ukelson global alignment} or CLZL
for {@linkplain
* neobio.alignment.CrochemoreLandauZivUkelsonLocalAlignment Crochemore, Landau &
* Ziv-Ukelson local alignment};
*
* <S1>
is the first sequence file;
*
* <S2>
is the second sequence file;
*
* M <matrix>
is for using a scoring matrix file;
*
* S <match> <mismatch> <gap>
is for using a
* simple scoring scheme, where <match>
is the match reward
* value, <mismatch>
is the mismatch penalty value and
* <gap>
is the cost of a gap (linear gap cost function).
* The main method takes three parameters from the command line to generate a
* sequence: type
, size
and file
, where:
*
type
is either DNA
for DNA sequences or
* PROT
for protein sequences.
* size
is the number of characters.
* file
(optional) is the name of a file (if omitted, sequence
* is written to standard output).
* The main method takes three parameters from the command line to generate a
* sequence: type
, size
and file
, where:
*
type
is either DNA
for DNA sequences or
* PROT
for protein sequences.
* size
is the number of characters.
* file
(optional) is the name of a file (if omitted, sequence
* is written to standard output).
* This implementation derives from the paper of M.Crochemore, G.Landau and * M.Ziv-Ukelson, A Sub-quadratic Sequence Alignment Algorithm for Unrestricted Scoring * Matrices (available here as * PDF or * Postscript).
* *For a general description of the algorithm, please refer to the specification of the * abstract {@linkplain CrochemoreLandauZivUkelson} superclass.
* *This class consists mainly of methods that:
* *This algorithm works essentially in the same way as the global alignment version. * The main difference is that an optimal path can either be contained entirely in a * block (called C-path) or be a block-crossing path. A block-crossing path * consists of a (possibly empty) S-path (a path that starts inside a block and * ends in its output border), followed by any number of paths that cross a block from its * input border to its output border, and ending in an E-path (a path that starts * in the input border of a block and ends inside the block).
* *Therefore, it is necessary to compute extra information to keep track of these * possibilities. This is accomplished by using an instance of a {@linkplain * LocalAlignmentBlock} (which extends the {@linkplain AlignmentBlock} class) for every * block in the block table.
* * @see CrochemoreLandauZivUkelson * @see CrochemoreLandauZivUkelsonLocalAlignment * @author Sergio A. de Carvalho Jr. */ public class CrochemoreLandauZivUkelsonLocalAlignment extends CrochemoreLandauZivUkelson { /** * A constant that indicates that the best path ending at a given entry of the output * border is a block-crossing path (one that starts outside the block). */ protected static final byte TYPE_CROSSING_PATH = 0; /** * A constant that indicates that the best path ending at a given entry of the output * border is a S-path (one that starts inside the block). */ protected static final byte TYPE_S_PATH = 1; /** * A constant that indicates that the high scoring path ending in a given block is a * C-path, i.e. one that starts inside the block. */ protected static final byte TYPE_C_PATH = 2; /** * A constant that indicates that the high scoring path ending in a given block is an * E-path, i.e. one that starts at its input border. */ protected static final byte TYPE_E_PATH = 3; /** * The score of the high scoring local alignment found. */ protected int max_score; /** * The row index of a block (in the block table) where the high scoring local * alignment ends. */ protected int max_row; /** * The column index of a block (in the block table) where the high scoring local * alignment ends. */ protected int max_col; /** * The type of the high scoring local alignment found. */ protected byte max_path_type; /** * If the high scoring local alignment ends in an E-path at a block B, this field * contains the index of the entry in the input border of B that where the E-path * starts. */ protected int max_source_index; /** * Creates and computes all information of an alignment block. This method works * essentially in the same way as its global alignment counterpart. Its main job is to * compute the DIST column for the block. It then request the *computeOutputBorder
method to compute the block's output border. It
* also computes all S, C and E-paths of this block. Finally, it checks if the C-path
* of this block is higher than the highest score found so far.
*
* @param factor1 factor of the first sequence
* @param factor2 factor of the second sequence
* @param row row index of the block in the block table
* @param col column index of the block in the block table
* @return the computed block
* @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
* with the sequences being aligned
*/
protected AlignmentBlock createBlock (Factor factor1, Factor factor2, int row,
int col) throws IncompatibleScoringSchemeException
{
// Fills a new block for the pair (factor1, factor2) from its left, diagonal and top
// prefix blocks, asks computeOutputBorder to compute its output border, and records
// the block as the end of the best alignment if its C value beats max_score.
LocalAlignmentBlock block, left_prefix, diag_prefix, top_prefix;
int size, lr, lc, max, ins_E, del_E;
int score_ins, score_sub, score_del, ins, del, sub;
// lr and lc are the lengths of the two factors; size = lr + lc + 1 is passed to the
// block constructor and bounds the main loop below
lr = factor1.length();
lc = factor2.length();
size = lr + lc + 1;
block = new LocalAlignmentBlock (factor1, factor2, size);
// retrieve pointers to prefixes
left_prefix = (LocalAlignmentBlock) getLeftPrefix (block);
diag_prefix = (LocalAlignmentBlock) getDiagonalPrefix (block);
top_prefix = (LocalAlignmentBlock) getTopPrefix (block);
// compute scores
// (scores of the three operations involving the new characters of the factors)
score_ins = scoreInsertion (factor2.getNewChar());
score_sub = scoreSubstitution (factor1.getNewChar(), factor2.getNewChar());
score_del = scoreDeletion (factor1.getNewChar());
// compute block's data
for (int i = 0; i < size; i++)
{
// candidates that are not available for this index keep the value MIN_VALUE
ins = sub = del = ins_E = del_E = Integer.MIN_VALUE;
// the left prefix contributes to every entry except the last one
if (i < size - 1)
{
ins = left_prefix.dist_column[i] + score_ins;
ins_E = left_prefix.E_path_score[i];
}
// the diagonal prefix contributes to every entry except the first and the last
if ((i > 0) && (i < size - 1))
{
sub = diag_prefix.dist_column[i - 1] + score_sub;
}
// the top prefix contributes to every entry except the first one (its entries are
// read one position to the left, at i - 1)
if (i > 0)
{
del = top_prefix.dist_column[i - 1] + score_del;
del_E = top_prefix.E_path_score[i - 1];
}
// DIST column entry: best of the three candidates; on ties the direction is chosen
// in the order LEFT, DIAGONAL, TOP
block.dist_column[i] = max = max (ins, sub, del);
if (max == ins)
block.direction[i] = LEFT_DIRECTION;
else if (max == sub)
block.direction[i] = DIAGONAL_DIRECTION;
else
block.direction[i] = TOP_DIRECTION;
// E-path entry: best of the score inherited from the left prefix, this block's own
// DIST entry, and the score inherited from the top prefix. The ancestor fields
// remember which block (and which of its entries) produced the winning score, with
// ties resolved in that same order.
block.E_path_score[i] = max = max (ins_E, block.dist_column[i], del_E);
if (max == ins_E)
{
block.E_path_ancestor[i] = left_prefix.E_path_ancestor[i];
block.E_path_ancestor_index[i] = left_prefix.E_path_ancestor_index[i];
}
else if (max == block.dist_column[i])
{
block.E_path_ancestor[i] = block;
block.E_path_ancestor_index[i] = i;
}
else
{
block.E_path_ancestor[i] = top_prefix.E_path_ancestor[i - 1];
block.E_path_ancestor_index[i] = top_prefix.E_path_ancestor_index[i - 1];
}
// S-path entry: entries before lc are copied from the left prefix, entries after lc
// are copied from the top prefix (shifted by one), and entry lc itself is computed
if (i < lc)
{
block.S_path_score[i] = left_prefix.S_path_score[i];
}
else if (i == lc)
{
// ins, sub and del are reused here as the S-path candidates; 0 is also a candidate,
// and when only 0 attains the maximum the S-path is marked with STOP_DIRECTION
ins = left_prefix.S_path_score[i-1] + score_ins;
sub = diag_prefix.S_path_score[i-1] + score_sub;
del = top_prefix.S_path_score[i] + score_del;
block.S_path_score[i] = max = max (0, ins, sub, del);
if (max == ins)
block.S_direction = LEFT_DIRECTION;
else if (max == sub)
block.S_direction = DIAGONAL_DIRECTION;
else if (max == del)
block.S_direction = TOP_DIRECTION;
else
block.S_direction = STOP_DIRECTION;
}
else
{
block.S_path_score[i] = top_prefix.S_path_score[i - 1];
}
}
computeOutputBorder (block, row, col, size, lc, lr);
// C value of the block: best of the C values of the left and top prefixes and of
// this block's S-path score at entry lc (ins and del are reused as temporaries; the
// value stored in max is not read again)
ins = left_prefix.C;
del = top_prefix.C;
block.C = max = max (ins, block.S_path_score[lc], del);
// a strictly better C value makes this block the end of the best alignment so far
if (block.C > max_score)
{
// assert block.C == block.S_path_score[lc]; => always true
max_score = block.C;
max_row = row;
max_col = col;
max_path_type = TYPE_C_PATH;
}
return block;
}
/**
 * Creates the root block of the block table. This is a degenerate case of the
 * <CODE>createBlock</CODE> method: nothing is computed for the block itself. As a
 * side effect, the fields that keep track of the high scoring alignment (score, row,
 * column and path type) are reset.
 *
 * @param factor1 factor of the first sequence
 * @param factor2 factor of the second sequence
 * @return the root block
 */
protected AlignmentBlock createRootBlock (Factor factor1, Factor factor2)
{
    // start from a clean state: no score has been found yet and the
    // (empty) best alignment is recorded as a C-path at position (0, 0)
    max_score = 0;
    max_row = 0;
    max_col = 0;
    max_path_type = TYPE_C_PATH;

    return new LocalAlignmentBlock (factor1, factor2);
}
/**
 * Creates and computes all information of an alignment block of the first row of the
 * block table. This is a special case of the <CODE>createBlock</CODE> method in
 * which only the left prefix of the block is consulted.
 *
 * @param factor1 factor of the first sequence
 * @param factor2 factor of the second sequence
 * @param col column index of the block in the block table
 * @return the computed block
 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
 * with the sequences being aligned
 * @see #createBlock createBlock
 */
protected AlignmentBlock createFirstRowBlock (Factor factor1, Factor factor2, int col)
    throws IncompatibleScoringSchemeException
{
    // a first-row block is sized from factor2 only (lr is fixed at zero)
    final int lr = 0; // factor1.length();
    final int lc = factor2.length();
    final int size = lr + lc + 1;

    LocalAlignmentBlock block = new LocalAlignmentBlock (factor1, factor2, size);
    LocalAlignmentBlock left = (LocalAlignmentBlock) getLeftPrefix (block);
    int score_ins = scoreInsertion (factor2.getNewChar());

    // every entry but the last one is derived from the left prefix
    for (int i = 0; i < lc; i++)
    {
        int dist = left.dist_column[i] + score_ins;

        block.dist_column[i] = dist;
        block.direction[i] = LEFT_DIRECTION;
        block.S_path_score[i] = left.S_path_score[i];

        if (dist > left.E_path_score[i])
        {
            // the path crossing this block beats the inherited E-path
            block.E_path_score[i] = dist;
            block.E_path_ancestor[i] = block;
            block.E_path_ancestor_index[i] = i;
        }
        else
        {
            // keep the E-path already known to the left prefix
            block.E_path_score[i] = left.E_path_score[i];
            block.E_path_ancestor[i] = left.E_path_ancestor[i];
            block.E_path_ancestor_index[i] = left.E_path_ancestor_index[i];
        }
    }

    // the last entry has score zero and stops here
    block.dist_column[lc] = 0;
    block.E_path_score[lc] = 0;
    block.direction[lc] = STOP_DIRECTION;
    block.E_path_ancestor[lc] = block;
    block.E_path_ancestor_index[lc] = lc;

    // S-path at entry lc: extend the left prefix's S-path with an insertion,
    // unless that does not give a positive score
    int s_score = left.S_path_score[lc - 1] + score_ins;
    if (s_score > 0)
    {
        block.S_path_score[lc] = s_score;
        block.S_direction = LEFT_DIRECTION;
    }
    else
    {
        block.S_path_score[lc] = 0;
        block.S_direction = STOP_DIRECTION;
    }

    computeOutputBorder (block, 0, col, size, lc, lr);

    // keep track of the best C value seen so far
    block.C = max (left.C, block.S_path_score[lc]);
    if (block.C > max_score)
    {
        max_score = block.C;
        max_row = 0;
        max_col = col;
        max_path_type = TYPE_C_PATH;
    }

    return block;
}
/**
 * Creates and computes all information of an alignment block of the first column of
 * the block table. This is a special case of the <CODE>createBlock</CODE> method in
 * which only the top prefix of the block is consulted.
 *
 * @param factor1 factor of the first sequence
 * @param factor2 factor of the second sequence
 * @param row row index of the block in the block table
 * @return the computed block
 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
 * with the sequences being aligned
 * @see #createBlock createBlock
 */
protected AlignmentBlock createFirstColumnBlock (Factor factor1, Factor factor2,
    int row) throws IncompatibleScoringSchemeException
{
    // a first-column block is sized from factor1 only (lc is fixed at zero)
    final int lr = factor1.length();
    final int lc = 0; // factor2.length();
    final int size = lr + lc + 1;

    LocalAlignmentBlock block = new LocalAlignmentBlock (factor1, factor2, size);
    LocalAlignmentBlock top = (LocalAlignmentBlock) getTopPrefix (block);
    int score_del = scoreDeletion (factor1.getNewChar());

    // the first entry has score zero and stops here
    block.dist_column[0] = 0;
    block.E_path_score[0] = 0;
    block.direction[0] = STOP_DIRECTION;
    block.E_path_ancestor[0] = block;
    block.E_path_ancestor_index[0] = 0;

    // S-path at entry 0: extend the top prefix's S-path with a deletion,
    // unless that does not give a positive score
    int s_score = top.S_path_score[0] + score_del;
    if (s_score > 0)
    {
        block.S_path_score[0] = s_score;
        block.S_direction = TOP_DIRECTION;
    }
    else
    {
        block.S_path_score[0] = 0;
        block.S_direction = STOP_DIRECTION;
    }

    // all remaining entries are derived from the top prefix, one position back
    for (int i = 1; i < size; i++)
    {
        int above = i - 1;
        int dist = top.dist_column[above] + score_del;

        block.dist_column[i] = dist;
        block.direction[i] = TOP_DIRECTION;
        block.S_path_score[i] = top.S_path_score[above];

        if (dist > top.E_path_score[above])
        {
            // the path crossing this block beats the inherited E-path
            block.E_path_score[i] = dist;
            block.E_path_ancestor[i] = block;
            block.E_path_ancestor_index[i] = i;
        }
        else
        {
            // keep the E-path already known to the top prefix
            block.E_path_score[i] = top.E_path_score[above];
            block.E_path_ancestor[i] = top.E_path_ancestor[above];
            block.E_path_ancestor_index[i] = top.E_path_ancestor_index[above];
        }
    }

    computeOutputBorder (block, row, 0, size, lc, lr);

    // keep track of the best C value seen so far
    block.C = max (block.S_path_score[lc], top.C);
    if (block.C > max_score)
    {
        max_score = block.C;
        max_row = row;
        max_col = 0;
        max_path_type = TYPE_C_PATH;
    }

    return block;
}
/**
 * Computes the output border of a block. The input border and the DIST matrix of the
 * block are assembled and handed to the OUT matrix interface, whose column maxima
 * (computed with SMAWK) give, for each entry of the output border, the source of the
 * best path crossing the block.
 *
 * <P>In addition, for each entry this method checks whether a path starting inside
 * the block (an S-path) scores at least as much as the best crossing path, in which
 * case the S-path is used for the output border. It also checks whether an E-path of
 * this block yields a score higher than the maximum score found so far.</P>
 *
 * @param block the block for which the output border is to be computed
 * @param row row index of the block in the block table
 * @param col column index of the block in the block table
 * @param dim dimension of the output border
 * @param lc number of columns of the block
 * @param lr number of rows of the block
 */
protected void computeOutputBorder (LocalAlignmentBlock block, int row, int col, int
    dim, int lc, int lr)
{
    int[] input_border = assembleInputBorder (dim, row, col, lr);
    int[][] dist_matrix = assembleDistMatrix (block, dim, row, col, lc);

    // expose the OUT matrix and let SMAWK find the source of each column maximum
    out_matrix.setData (dist_matrix, input_border, dim, lc);
    smawk.computeColumnMaxima(out_matrix, block.source_path);

    for (int i = 0; i < dim; i++)
    {
        int crossing_score = out_matrix.valueAt(block.source_path[i], i);

        // on equal scores the S-path is preferred because it ends sooner
        if (block.S_path_score[i] >= crossing_score)
        {
            block.output_border[i] = block.S_path_score[i];
            block.path_type[i] = TYPE_S_PATH;
        }
        else
        {
            block.output_border[i] = crossing_score;
            block.path_type[i] = TYPE_CROSSING_PATH;
        }

        // does an E-path entering at input entry i beat the best score so far?
        int e_path_total = input_border[i] + block.E_path_score[i];
        if (e_path_total > max_score)
        {
            max_score = e_path_total;
            max_row = row;
            max_col = col;
            max_source_index = i;
            max_path_type = TYPE_E_PATH;
        }
    }
}
/**
 * Builds an optimal local alignment between the loaded sequences after the block
 * table has been computed. The alignment is recovered by tracing a path back from the
 * block recorded as the end of the high scoring alignment.
 *
 * @return an optimal local alignment
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 * @see CrochemoreLandauZivUkelson#traverseBlock
 */
protected PairwiseAlignment buildOptimalAlignment ()
    throws IncompatibleScoringSchemeException
{
    StringBuffer gapped_seq1 = new StringBuffer();
    StringBuffer tag_line = new StringBuffer();
    StringBuffer gapped_seq2 = new StringBuffer();

    // the block where the high scoring alignment ends
    LocalAlignmentBlock end_block =
        (LocalAlignmentBlock) block_table[max_row][max_col];

    if (max_path_type != TYPE_C_PATH)
    {
        traverseBlockCrossingPath (end_block, gapped_seq1, tag_line, gapped_seq2);
    }
    else
    {
        // a C-path is essentially an S-path
        traverseS_Path (end_block, gapped_seq1, tag_line, gapped_seq2);
    }

    return new PairwiseAlignment (gapped_seq1.toString(), tag_line.toString(),
        gapped_seq2.toString(), locateScore());
}
/**
* Traverses a series of block crossing paths to retrieve an optimal alignment. A
* block-crossing path consists of a (possibly empty) S-path (a path that
* starts inside a block and ends in its output border), followed by any number of
* paths that cross a block from its input border to its output border, and ending in
* an E-path (a path that starts in the input border of a block and ends inside
* the block).
*
* The traversal runs backwards: it starts with the E-path recorded for the high
* scoring alignment (fields max_row, max_col and max_source_index) and then moves
* from block to block until it either reaches the root block or finds an output
* border entry that is marked as the end of an S-path.
*
* @param block the block to be traversed
* @param gapped_seq1 the StringBuffer to where the gapped sequence 1 is written to
* @param tag_line the StringBuffer to where the tag_line is written to
* @param gapped_seq2 the StringBuffer to where the gapped sequence 2 is written to
* @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
* with the loaded sequences.
*/
protected void traverseBlockCrossingPath (LocalAlignmentBlock block,
StringBuffer gapped_seq1, StringBuffer tag_line, StringBuffer gapped_seq2)
throws IncompatibleScoringSchemeException
{
LocalAlignmentBlock ancestor;
int source, dest, ancestor_source;
int row, col;
// start at the block where the high scoring alignment ends
row = max_row;
col = max_col;
// recover the E-path
// (the ancestor block and the index stored for the entry where the E-path starts)
source = max_source_index;
ancestor = block.E_path_ancestor[source];
ancestor_source = block.E_path_ancestor_index[source];
traverseBlock (ancestor, ancestor_source, gapped_seq1, tag_line, gapped_seq2);
// now recover crossing paths
while (true)
{
// Step to the neighbouring block selected by the current source index and compute
// dest, the index used to read that neighbour's path_type, source_path and ancestor.
// NOTE(review): the index arithmetic presumably mirrors the input border layout
// built by assembleInputBorder, which is not visible here -- confirm.
if (row == 0)
{
// first row: the only neighbour is the block on the left
col = col - 1;
dest = block_table[row][col].factor2.length();
}
else if (col == 0)
{
// first column: the only neighbour is the block above
row = row - 1;
dest = 0;
}
else
{
if (source < block.factor1.length())
{
// source lies before index factor1.length(): go to the block on the left
col = col - 1;
dest = block_table[row][col].factor2.length() + source;
}
else if (source == block.factor1.length())
{
// source is exactly index factor1.length(): go to the diagonal block
row = row - 1; col = col - 1;
dest = block_table[row][col].factor2.length();
}
else
{
// source lies after index factor1.length(): go to the block above
row = row - 1;
dest = source - block.factor1.length();
}
}
// check if has reached the root block
if (!(row > 0 || col > 0)) break;
block = (LocalAlignmentBlock) block_table[row][col];
if (block.path_type[dest] == TYPE_S_PATH)
{
// last part, an S-path, and we're done
ancestor = (LocalAlignmentBlock) block.ancestor[dest];
traverseS_Path (ancestor, gapped_seq1, tag_line, gapped_seq2);
break;
}
// otherwise the path crosses this block: continue from the source recorded for dest
source = block.source_path[dest];
ancestor = (LocalAlignmentBlock) block.ancestor[dest];
ancestor_source = source;
// for entries past factor2.length() the source index is reduced by the difference
// between the factor1 lengths of this block and of its ancestor before traversing
if (dest > block.factor2.length())
ancestor_source -= (block.factor1.length() - ancestor.factor1.length());
traverseBlock (ancestor, ancestor_source, gapped_seq1, tag_line, gapped_seq2);
}
}
/**
 * Traverses an S-path to retrieve a part of an optimal alignment, walking from the
 * given block back through its prefix blocks until a block whose
 * <CODE>S_direction</CODE> is <CODE>STOP_DIRECTION</CODE> is reached. This method is
 * similar to <CODE>traverseBlock</CODE>, except that it follows the
 * <CODE>S_direction</CODE> field of the <CODE>LocalAlignmentBlock</CODE> class.
 * Characters are inserted at the front of the buffers since the path is followed
 * backwards.
 *
 * @param block the block to be traversed
 * @param gapped_seq1 the StringBuffer to where the gapped sequence 1 is written to
 * @param tag_line the StringBuffer to where the tag_line is written to
 * @param gapped_seq2 the StringBuffer to where the gapped sequence 2 is written to
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected void traverseS_Path (LocalAlignmentBlock block, StringBuffer gapped_seq1,
    StringBuffer tag_line, StringBuffer gapped_seq2)
    throws IncompatibleScoringSchemeException
{
    LocalAlignmentBlock current = block;

    while (current.S_direction != STOP_DIRECTION)
    {
        char new_char1 = current.factor1.getNewChar();
        char new_char2 = current.factor2.getNewChar();

        if (current.S_direction == LEFT_DIRECTION)
        {
            // insertion: gap in sequence 1
            gapped_seq1.insert (0, GAP_CHARACTER);
            tag_line.insert (0, GAP_TAG);
            gapped_seq2.insert (0, new_char2);
            current = (LocalAlignmentBlock) getLeftPrefix (current);
        }
        else if (current.S_direction == DIAGONAL_DIRECTION)
        {
            // substitution: pick the tag according to the kind of match
            gapped_seq1.insert (0, new_char1);
            if (new_char1 != new_char2)
            {
                if (scoreSubstitution(new_char1, new_char2) > 0)
                    tag_line.insert (0, APPROXIMATE_MATCH_TAG);
                else
                    tag_line.insert (0, MISMATCH_TAG);
            }
            else if (useMatchTag())
            {
                tag_line.insert (0, MATCH_TAG);
            }
            else
            {
                tag_line.insert (0, new_char1);
            }
            gapped_seq2.insert(0, new_char2);
            current = (LocalAlignmentBlock) getDiagonalPrefix (current);
        }
        else if (current.S_direction == TOP_DIRECTION)
        {
            // deletion: gap in sequence 2
            gapped_seq1.insert (0, new_char1);
            tag_line.insert (0, GAP_TAG);
            gapped_seq2.insert (0, GAP_CHARACTER);
            current = (LocalAlignmentBlock) getTopPrefix (current);
        }
    }
}
/**
 * Returns the score of the high scoring local alignment, as recorded in the
 * <CODE>max_score</CODE> field while the block table was being computed.
 *
 * @return the score of the highest scoring local alignment
 */
protected int locateScore ()
{
    return this.max_score;
}
}
neobio-0.0.20030929/src/neobio/alignment/PairwiseAlignment.java 0000644 0002656 0002032 00000016766 07717074370 023220 0 ustar tillea admin /*
* PairwiseAlignment.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Serializable;
/**
* This class is the product of a pairwise alignment, generated by one of the subclasses of
* {@linkplain PairwiseAlignmentAlgorithm}. It contains the two sequences strings with
* gaps, a score tag line, and a score value. It is typically displayed in three rows as
* in the following example of an alignment between parts of two protein sequences:
*
*
*
*
* MDEIHQLEDMFTVDSETLRKVVKHFILPHD-----MRTTKHQEELWSFIAELDSLKDFMVEQE // sequence 1
* M +I E +FTV +ETL+ V KHFILP D MRTT++ +ELW FIA DSLK F+ EQ // score tag line
* MQQIENFEKIFTVPTETLQAVTKHFILP-DATETLMRTTQNPDELWEFIA--DSLKAFIDEQF // sequence 2
*
Each column has one character of each sequence and a score tag. The same character * is displayed in all three rows when a column has an exact match (character of sequences * 1 and 2 are equal). When a mismatch occurs (substitution of different characters), the * score tag is left blank. A '+' in the score line signals a partial match (a * substitution of similar characters). The difference between a partial match and a * mismatch is that the score of a partial match is positive whereas the score of a * mismatch is zero or negative (each case is determined by the scoring scheme).
* *Gaps are usually represented by dashes ('-') and have a blank score tag. Insertions * have dashes in sequence 1 and the inserted character in sequence 2. Deletions, by * contrast, have the deleted character in sequence 1 and dashes in sequence 2.
* *Each column carries a score value for the corresponding operation (as defined by the * scoring scheme). The overall score of a pairwise alignment is the sum of all columns * scores values.
* *When the scoring schemes does not support partial matches, a match is usually * signaled by a '|' character.
* *
Note that these special characters are defined by the
* PairwiseAlignmentAlgorithm
class. Consult that class specification for the
* actual configuration. For instance, an alignment between two DNA fragments may look like
* this:
* A--C--TAAAAAGCA--TT-AATAATAAA-A
* | | |||| ||| || ||||| ||| |
* AAGCCCTAAACCGCAAGTTTAATAA-AAATA
*
*
* This class is serializable, so it can be saved to a file (or any other output). It
* overrides the default equals
method of the Object
class to
* allow a proper comparison of alignments produced by different algorithms or even
* different runs of the same algorithm. However, it does not override the
* hashCode
method as it is generally the case to maintain the contract for
* the hashCode
method (which states that equal objects must have equal hash
* codes). Hence, as it is, its use in a hash table is not supported.
PairwiseAlignment
instance with the specified gapped
* sequences, score tag line and score value.
*
* @param gapped_seq1 the first gapped sequence
* @param score_tag_line the score tag line
* @param gapped_seq2 the second gapped sequence
* @param score the overall score value for this alignment
*/
public PairwiseAlignment (String gapped_seq1, String score_tag_line,
    String gapped_seq2, int score)
{
    // the arguments are stored as given (no copy, no validation)
    this.score = score;
    this.gapped_seq1 = gapped_seq1;
    this.gapped_seq2 = gapped_seq2;
    this.score_tag_line = score_tag_line;
}
/**
 * Returns the first sequence of this alignment, including its gap characters.
 *
 * @return the first gapped sequence
 */
public String getGappedSequence1 ()
{
    return this.gapped_seq1;
}
/**
 * Returns the score tag line of this alignment.
 *
 * @return the score tag line
 */
public String getScoreTagLine ()
{
    return this.score_tag_line;
}
/**
 * Returns the second sequence of this alignment, including its gap characters.
 *
 * @return the second gapped sequence
 */
public String getGappedSequence2 ()
{
    return this.gapped_seq2;
}
/**
 * Returns the overall score value of this alignment.
 *
 * @return the overall score for this alignment
 */
public int getScore ()
{
    return this.score;
}
/**
 * Returns a four-line String representation of this alignment. The lines are, in
 * this order: the first gapped sequence, the score tag line, the second gapped
 * sequence and the score value (prefixed with "Score: ").
 *
 * @return a String representation of this alignment
 */
public String toString ()
{
    StringBuffer text = new StringBuffer();

    text.append (gapped_seq1).append ("\n");
    text.append (score_tag_line).append ("\n");
    text.append (gapped_seq2).append ("\nScore: ");
    text.append (score);

    return text.toString();
}
/**
 * Compares this object to the specified object. The result is <CODE>true</CODE> if
 * and only if the argument is not <CODE>null</CODE> and is a
 * <CODE>PairwiseAlignment</CODE> object that contains the same values as this
 * object, i.e. the same score, the same gapped sequences and the same score tag
 * line.
 *
 * @param obj the object to compare with
 * @return <CODE>true</CODE> if objects are the same, <CODE>false</CODE> otherwise
 */
public boolean equals (Object obj)
{
    if (obj instanceof PairwiseAlignment)
    {
        PairwiseAlignment other = (PairwiseAlignment) obj;

        // the cheap integer comparison goes first, then the strings
        return this.score == other.score
            && this.gapped_seq1.equals(other.gapped_seq1)
            && this.score_tag_line.equals(other.score_tag_line)
            && this.gapped_seq2.equals(other.gapped_seq2);
    }

    // null or an object of another type
    return false;
}
}
neobio-0.0.20030929/src/neobio/alignment/AlignmentBlock.java 0000644 0002656 0002032 00000010067 07725254634 022455 0 ustar tillea admin /*
* AlignmentBlock.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
* This class is used by the {@linkplain CrochemoreLandauZivUkelson} algorithm to store
* the information of an alignment block. All fields are public (but final) in order to
* simplify the access to the data.
*
* For more information on how this class is used, please refer to the specification
* of the CrochemoreLandauZivUkelson
class and its subclasses.
source_path
and
* ancestor
arrays. Moreover, its dist_column
and
* output_border
arrays are set to zero, and the direction
* array is set to contain an STOP_DIRECTION
.
*
* @param factor1 factor of the first sequence being aligned
* @param factor2 factor of the second sequence being aligned
*/
public AlignmentBlock (Factor factor1, Factor factor2)
{
    // root block: dist_column and output_border deliberately refer to the
    // same one-element array, whose single entry is zero
    int[] single_zero = new int[1];

    this.factor1 = factor1;
    this.factor2 = factor2;
    this.dist_column = single_zero;
    this.output_border = single_zero;
    this.direction = new byte[1]; // single entry 0, i.e. STOP_DIRECTION
    this.source_path = null;
    this.ancestor = null;
}
/**
 * Creates a new alignment block for the given pair of factors. Each of the block's
 * arrays (<CODE>dist_column</CODE>, <CODE>output_border</CODE>,
 * <CODE>direction</CODE>, <CODE>source_path</CODE> and <CODE>ancestor</CODE>) is
 * created as a separate array of the specified size.
 *
 * @param factor1 factor of the first sequence being aligned
 * @param factor2 factor of the second sequence being aligned
 * @param size size of the arrays to be created
 */
public AlignmentBlock (Factor factor1, Factor factor2, int size)
{
    this.factor1 = factor1;
    this.factor2 = factor2;

    // integer data
    this.dist_column = new int[size];
    this.output_border = new int[size];
    this.source_path = new int[size];

    // directions and ancestor pointers
    this.direction = new byte[size];
    this.ancestor = new AlignmentBlock[size];
}
}
neobio-0.0.20030929/src/neobio/alignment/FactorSequence.java 0000644 0002656 0002032 00000021721 07717074374 022474 0 ustar tillea admin /*
* FactorSequence.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.BufferedReader;
import java.io.IOException;
/**
* This class builds a list of factors of a character sequence as induced by its
* Lempel-Ziv (LZ78) factorisation. Each factor is encoded as the longest factor
* previously seen plus one character.
*
* The input can come from any source, provided it is encapsulated in a proper
* Reader
instance. The stream is expected to be ready (i.e. the next
* read
operation must return the first character of the sequence) and it is
* not closed when its end is reached, so the client is allowed to reset it and maybe use
* it for another purpose.
Sequences can contain letters only although lines started with the
* COMMENT_CHAR
character ('>') are regarded as comments and are completely
* skipped. White spaces (including tabs, line feeds and carriage returns) are also
* ignored throughout.
This class uses a {@linkplain Trie} to keep track of a list of factors. Each node of * the trie contains a {@linkplain Factor} of the text. As the sequence is read from the * input, the trie is traversed as far as possible. When a leaf node is reached (which * means that the longest prefix of the input has been found), two tasks are * accomplished:
* *Factor
is created with the character at the current position of
* the input and the leaf node's factor;
* Each factor also receives a serial number according to the order they are found and * a pointer to the next factor (in that order) for fast access. This pointer, together * with the factor's ancestor pointer forms a doubly-linked list of factors. The original * text can then be reconstructed simply by following the linked list and writing out its * factors.
* *As an example, the sequence ACTAAACCGCATTAATAATAAAA
is parsed into the
* following 12 factors:
* 0 ( , ) = empty
* 1 (0,A) = A
* 2 (0,C) = C
* 3 (0,T) = T
* 4 (1,A) = AA
* 5 (1,C) = AC
* 6 (2,G) = CG
* 7 (2,A) = CA
* 8 (3,T) = TT
* 9 (4,T) = AAT
* 10 (9,A) = AATA
* 11 (4,A) = AAA
*
* serial # (prefix, new char) = factor text
*
*
* This class is used by {@linkplain CrochemoreLandauZivUkelson} algorithm to speed up * the classic dynamic programming approach to sequence alignment.
* * @author Sergio A. de Carvalho Jr. * @see Factor * @see Trie * @see CrochemoreLandauZivUkelson */ public class FactorSequence { /** * The character used to start a comment line in a sequence file. When this character * is found, the rest of the line is ignored. */ protected static final char COMMENT_CHAR = '>'; /** * A pointer to the root factor, the one that starts the list of factors. */ protected Factor root_factor; /** * The numbers of character represented by this sequence. */ protected int num_chars; /** * The numbers of factors generated by the LZ78 parsing of the sequence. */ protected int num_factors; /** * Creates a new instance of aFactorSequence
, loading the sequence data
* from the Reader
input stream. A doubly-linked list of factors is built
* according to its LZ78 factorisation.
*
* @param reader source of characters for this sequence
* @throws IOException if an I/O exception occurs when reading the input
* @throws InvalidSequenceException if the input does not contain a valid sequence
*/
public FactorSequence (Reader reader)
    throws IOException, InvalidSequenceException
{
    BufferedReader input = new BufferedReader(reader);

    // the trie starts with a single node holding the root factor
    root_factor = new Factor ();
    Trie root_node = new Trie (root_factor);
    Trie current_node = root_node;
    Trie matched_node = null;
    Factor last_factor = root_factor;
    num_factors = 1;
    num_chars = 0;

    int ch;
    while ((ch = input.read()) != -1)
    {
        char c = (char) ch;

        if (c == COMMENT_CHAR)
        {
            // it's a comment: skip the rest of the line
            input.readLine();
            continue;
        }

        if (!Character.isLetter(c))
        {
            // white spaces are ignored, anything else is an error
            if (!Character.isWhitespace(c))
                throw new InvalidSequenceException
                    ("Sequences can contain letters only.");
            continue;
        }

        // a letter: try to walk one step further down the trie
        num_chars++;
        matched_node = current_node.spellDown(c);

        if (matched_node == null)
        {
            // cannot go any further: the current node's factor plus this
            // character becomes a new factor, which is added to the trie and
            // linked after the last factor created
            Factor prefix = (Factor) current_node.getData();
            Factor new_factor = new Factor (prefix, num_factors, c);
            current_node.add (new_factor, c);
            last_factor.setNext (new_factor);
            last_factor = new_factor;

            // restart at the root of the trie
            current_node = root_node;
            num_factors++;
        }
        else
        {
            current_node = matched_node;
        }
    }

    // if the last letter read was matched in the trie, the text ends with a
    // factor that already exists: no new factor is created, the last factor
    // is simply pointed to the existing one
    if (matched_node != null)
    {
        last_factor.setNext((Factor) matched_node.getData());
        num_factors++;
    }

    // only the root factor means that nothing useful was read
    if (num_factors <= 1)
        throw new InvalidSequenceException ("Empty sequence.");
}
/**
 * Returns the root factor, i.e. the factor from which the list of factors starts.
 *
 * @return the root factor
 */
public Factor getRootFactor ()
{
    return this.root_factor;
}
/**
 * Returns the number of factors counted during the LZ78 parsing of the text. The
 * count starts at one for the root factor.
 *
 * @return number of factors
 */
public int numFactors()
{
    return this.num_factors;
}
/**
 * Returns the number of characters of the original sequence, i.e. the number of
 * letters read from the input.
 *
 * @return number of characters of the original sequence
 */
public int numChars ()
{
    return this.num_chars;
}
/**
 * Reconstructs the sequence from the list of factors induced by the LZ78 parsing of
 * the text, by following the list from the factor after the root and appending the
 * string representation of each factor.
 *
 * @return the original sequence
 */
public String toString ()
{
    StringBuffer text = new StringBuffer();
    Factor current = root_factor.getNext();
    int visited = 1; // the root factor is not written out

    while (visited < numFactors())
    {
        text.append(current);
        current = current.getNext();
        visited++;
    }

    return text.toString();
}
/**
 * Returns a string representation of the actual list of factors produced by the LZ78
 * parsing of the text. Each factor is written on a separate line, in the order the
 * factors appear in the text, with its serial number, its ancestor's serial number,
 * its new character, its length and a string representation of the factor itself. A
 * final line reports the number of factors.
 *
 * @return a string representation of the list of factors
 */
public String printFactors ()
{
    StringBuffer listing = new StringBuffer();
    Factor current = root_factor.getNext();
    int visited = 1; // the root factor is not listed

    while (visited < numFactors())
    {
        listing.append (current.getSerialNumber()).append ("\t<");
        listing.append (current.getAncestor().getSerialNumber()).append (" ,\t");
        listing.append (current.getNewChar()).append (">\t");
        listing.append (current.length()).append ("\t");
        listing.append (current).append ("\n");

        current = current.getNext();
        visited++;
    }

    listing.append (numFactors()).append (" factors\n");
    return listing.toString();
}
}
neobio-0.0.20030929/src/neobio/alignment/PairwiseAlignmentAlgorithm.java 0000644 0002656 0002032 00000041112 07717234200 025034 0 ustar tillea admin /*
* PairwiseAlignmentAlgorithm.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.IOException;
/**
 * Common superclass of all classes implementing pairwise sequence alignment algorithms.
 * Subclasses supply the code that builds a high scoring alignment between two sequences
 * and that computes its score with a given scoring scheme.
 *
 * <P>A scoring scheme must be set and two sequences must be loaded before an alignment
 * or the computation of its score is requested. Clients typically make the following
 * sequence of method calls:</P>
 *
 * <CODE><BLOCKQUOTE><PRE>
 * // prepare
 * PairwiseAlignmentAlgorithm algorithm = new SomePairwiseAlignmentAlgorithm ();
 * algorithm.setScoringScheme (some_scoring_scheme);
 * algorithm.loadSequences (sequence1, sequence2);
 *
 * // now compute the alignment
 * PairwiseAlignment alignment = algorithm.getPairwiseAlignment();
 * int score = algorithm.getScore();
 * </PRE></BLOCKQUOTE></CODE>
 *
 * @author Sergio A. de Carvalho Jr.
 * @see PairwiseAlignment
 */
public abstract class PairwiseAlignmentAlgorithm
{
    /**
     * Tag character that marks a match in the score tag line of an alignment. Whether
     * it is used depends on the <CODE>use_match_tag</CODE> flag.
     *
     * @see #use_match_tag
     * @see #useMatchTag
     */
    protected static final char MATCH_TAG = '|';

    /**
     * Tag character that marks an approximate match in the score tag line of an
     * alignment.
     */
    protected static final char APPROXIMATE_MATCH_TAG = '+';

    /**
     * Character that marks a mismatch in the score tag line of an alignment.
     */
    protected static final char MISMATCH_TAG = ' ';

    /**
     * Character that marks a gap in the score tag line of an alignment.
     */
    protected static final char GAP_TAG = ' ';

    /**
     * Character that marks a gap in a sequence.
     */
    protected static final char GAP_CHARACTER = '-';

    /**
     * Tells whether the <CODE>MATCH_TAG</CODE> tag should be used. When
     * <CODE>true</CODE>, the algorithm should write <CODE>MATCH_TAG</CODE> in the score
     * tag line whenever characters of the two sequences match; when <CODE>false</CODE>
     * the matching character should be written instead. The flag is refreshed every
     * time a scoring scheme is set through <CODE>setScoringScheme</CODE>.
     *
     * @see #MATCH_TAG
     * @see #useMatchTag
     * @see #setScoringScheme
     */
    protected boolean use_match_tag;

    /**
     * The scoring scheme used to compute a pairwise sequence alignment. It must be set
     * before an alignment is performed; setting a new one discards any alignment or
     * score already computed.
     */
    protected ScoringScheme scoring;

    /**
     * The product of the last pairwise alignment performed. It is assigned after a
     * successful run of <CODE>computePairwiseAlignment</CODE> and reset to null when
     * new sequences are loaded, sequences are unloaded or a new scoring scheme is set.
     */
    protected PairwiseAlignment alignment;

    /**
     * The score of the last pairwise alignment performed, meaningful only while the
     * <CODE>score_computed</CODE> flag is true. It is useful when only the score is
     * needed and not the alignment itself. It is assigned after a successful run of
     * either <CODE>computePairwiseAlignment</CODE> or <CODE>computeScore</CODE>.
     */
    protected int score;

    /**
     * Flags whether the score for the currently loaded sequences has already been
     * computed. It becomes true after a successful run of either
     * <CODE>computePairwiseAlignment</CODE> or <CODE>computeScore</CODE>, and false
     * again when new sequences are loaded, sequences are unloaded or a new scoring
     * scheme is set.
     */
    protected boolean score_computed = false;

    /**
     * Flags whether sequences have been loaded. It becomes true after subclasses
     * successfully load two sequences.
     */
    protected boolean sequences_loaded = false;

    /**
     * Sets the scoring scheme to be used for the next alignments. Any alignment or score
     * already computed is lost. The <CODE>MATCH_TAG</CODE> tag is disabled when the
     * scoring scheme supports partial matches (the score tag line would be confusing in
     * that case) and enabled otherwise.
     *
     * @param scoring Scoring scheme to be used
     * @throws IllegalArgumentException if <CODE>scoring</CODE> is null
     * @see #MATCH_TAG
     * @see ScoringScheme#isPartialMatchSupported
     */
    public void setScoringScheme (ScoringScheme scoring)
    {
        if (scoring == null)
            throw new IllegalArgumentException ("Null scoring scheme object.");

        this.scoring = scoring;

        // the match tag is only used when partial matches are not supported
        this.use_match_tag = !scoring.isPartialMatchSupported();

        // results obtained with the previous scheme are no longer valid
        this.alignment = null;
        this.score_computed = false;
    }

    /**
     * Tells whether the <CODE>MATCH_TAG</CODE> tag should be used. The value returned is
     * the current state of the <CODE>use_match_tag</CODE> flag, which is updated
     * whenever a scoring scheme is set by the <CODE>setScoringScheme</CODE> method.
     *
     * @return <CODE>true</CODE> if the <CODE>MATCH_TAG</CODE> tag should be used,
     * <CODE>false</CODE> otherwise
     * @see #MATCH_TAG
     * @see #use_match_tag
     * @see #setScoringScheme
     */
    protected boolean useMatchTag ()
    {
        return this.use_match_tag;
    }

    /**
     * Asks the subclass to load the two sequences according to its own needs. Any
     * alignment and score already computed is lost. The loaded flag is set to true only
     * if the subclass raises no exception. Input can come from any source wrapped in a
     * proper Reader; the readers are expected to be ready, i.e. the next read operation
     * must return the sequence's first character.
     *
     * @param input1 First sequence
     * @param input2 Second sequence
     * @throws IOException If an I/O error occurs when reading the sequences
     * @throws InvalidSequenceException If the sequences are not valid
     */
    public void loadSequences (Reader input1, Reader input2)
        throws IOException, InvalidSequenceException
    {
        // cleared first, so a failure below leaves the algorithm in the unloaded state
        this.sequences_loaded = false;

        // previously computed results do not apply to the new sequences
        this.score_computed = false;
        this.alignment = null;

        loadSequencesInternal (input1, input2);

        // reached only when the subclass raised no exception
        this.sequences_loaded = true;
    }

    /**
     * Drops the references to the loaded sequences and to any computed alignment so
     * that their data can be garbage collected.
     */
    public void unloadSequences ()
    {
        // forget any result already computed
        this.score_computed = false;
        this.alignment = null;

        // let the subclass release its own storage
        unloadSequencesInternal ();

        this.sequences_loaded = false;
    }

    /**
     * Returns the last pairwise alignment computed, if any; otherwise asks the subclass
     * to compute one through <CODE>computePairwiseAlignment</CODE>, stores its score and
     * returns it. The sequences must already be loaded and a scoring scheme must already
     * be set.
     *
     * @return a pairwise alignment between the loaded sequences
     * @throws IllegalStateException if no sequences are loaded or no scoring scheme is
     * set
     * @throws IncompatibleScoringSchemeException If the scoring scheme
     * is not compatible with the loaded sequences
     * @see #computePairwiseAlignment
     */
    public PairwiseAlignment getPairwiseAlignment ()
        throws IncompatibleScoringSchemeException
    {
        if (!sequences_loaded)
            throw new IllegalStateException ("Sequences have not been loaded.");

        if (scoring == null)
            throw new IllegalStateException ("Scoring scheme has not been set.");

        // reuse the result of a previous computation
        if (this.alignment != null)
            return this.alignment;

        // the computation holds the scoring scheme's monitor while it runs
        synchronized (scoring)
        {
            this.alignment = computePairwiseAlignment();
        }

        // the alignment carries its score, so record it as well
        this.score = this.alignment.getScore();
        this.score_computed = true;

        return this.alignment;
    }

    /**
     * Returns the score of the last alignment computed, if any; otherwise asks the
     * subclass to compute it through <CODE>computeScore</CODE>. The sequences must
     * already be loaded and a scoring scheme must already be set.
     *
     * @return the score of the alignment between the loaded sequences
     * @throws IllegalStateException if no sequences are loaded or no scoring scheme is
     * set
     * @throws IncompatibleScoringSchemeException If the scoring scheme
     * is not compatible with the loaded sequences
     * @see #computeScore
     */
    public int getScore () throws IncompatibleScoringSchemeException
    {
        if (!sequences_loaded)
            throw new IllegalStateException ("Sequences have not been loaded.");

        if (scoring == null)
            throw new IllegalStateException ("Scoring scheme has not been set.");

        // reuse the result of a previous computation
        if (score_computed)
            return this.score;

        // the computation holds the scoring scheme's monitor while it runs
        synchronized (scoring)
        {
            this.score = computeScore();
        }
        this.score_computed = true;

        return this.score;
    }

    /**
     * Subclasses implement this method to load the sequences according to their own
     * needs, throwing an exception on any failure. When no exception is raised, the
     * public <CODE>loadSequences</CODE> method sets the loaded flag to true and an
     * alignment or score can then be requested.
     *
     * @param input1 First sequence
     * @param input2 Second sequence
     * @throws IOException If an I/O error occurs when reading the sequences
     * @throws InvalidSequenceException If the sequences are not valid
     * @see #loadSequences
     * @see CharSequence
     * @see FactorSequence
     */
    protected abstract void loadSequencesInternal (Reader input1, Reader input2)
        throws IOException, InvalidSequenceException;

    /**
     * Subclasses implement this method to release their own storage, freeing pointers
     * to sequences and any intermediate data so that they can be garbage collected. It
     * is called by the public <CODE>unloadSequences</CODE> method.
     *
     * @see #unloadSequences
     */
    protected abstract void unloadSequencesInternal ();

    /**
     * Subclasses implement this method to compute an alignment between the loaded
     * sequences using the scoring scheme previously set. It is called by the
     * <CODE>getPairwiseAlignment</CODE> method when needed.
     *
     * @return a pairwise alignment between the loaded sequences
     * @throws IncompatibleScoringSchemeException If the scoring scheme
     * is not compatible with the loaded sequences
     * @see #getPairwiseAlignment
     */
    protected abstract PairwiseAlignment computePairwiseAlignment ()
        throws IncompatibleScoringSchemeException;

    /**
     * Subclasses implement this method to compute the score of the alignment between
     * the loaded sequences using the scoring scheme previously set. It is called by the
     * <CODE>getScore</CODE> method when needed.
     *
     * @return the score of the alignment between the loaded sequences
     * @throws IncompatibleScoringSchemeException If the scoring scheme
     * is not compatible with the loaded sequences
     * @see #getScore
     */
    protected abstract int computeScore () throws IncompatibleScoringSchemeException;

    /**
     * Helper that forwards to the <CODE>scoreSubstitution</CODE> method of the scoring
     * scheme set to this algorithm.
     *
     * @param a first character
     * @param b second character
     * @return score of substitution of <CODE>a</CODE> for <CODE>b</CODE>
     * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
     * with the sequences being aligned
     * @see ScoringScheme#scoreSubstitution
     */
    protected final int scoreSubstitution (char a, char b)
        throws IncompatibleScoringSchemeException
    {
        return this.scoring.scoreSubstitution (a, b);
    }

    /**
     * Helper that forwards to the <CODE>scoreInsertion</CODE> method of the scoring
     * scheme set to this algorithm.
     *
     * @param a the character to be inserted
     * @return score of insertion of <CODE>a</CODE>
     * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
     * with the sequences being aligned
     * @see ScoringScheme#scoreInsertion
     */
    protected final int scoreInsertion (char a) throws IncompatibleScoringSchemeException
    {
        return this.scoring.scoreInsertion (a);
    }

    /**
     * Helper that forwards to the <CODE>scoreDeletion</CODE> method of the scoring
     * scheme set to this algorithm.
     *
     * @param a the character to be deleted
     * @return score of deletion of <CODE>a</CODE>
     * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
     * with the sequences being aligned
     * @see ScoringScheme#scoreDeletion
     */
    protected final int scoreDeletion (char a) throws IncompatibleScoringSchemeException
    {
        return this.scoring.scoreDeletion (a);
    }

    /**
     * Helper method to compute the greater of two values.
     *
     * @param v1 first value
     * @param v2 second value
     * @return the larger of <CODE>v1</CODE> and <CODE>v2</CODE>
     */
    protected final int max (int v1, int v2)
    {
        return Math.max (v1, v2);
    }

    /**
     * Helper method to compute the greatest of three values.
     *
     * @param v1 first value
     * @param v2 second value
     * @param v3 third value
     * @return the largest of <CODE>v1</CODE>, <CODE>v2</CODE> and <CODE>v3</CODE>
     */
    protected final int max (int v1, int v2, int v3)
    {
        return Math.max (Math.max (v1, v2), v3);
    }

    /**
     * Helper method to compute the greatest of four values.
     *
     * @param v1 first value
     * @param v2 second value
     * @param v3 third value
     * @param v4 fourth value
     * @return the largest of <CODE>v1</CODE>, <CODE>v2</CODE>, <CODE>v3</CODE> and
     * <CODE>v4</CODE>
     */
    protected final int max (int v1, int v2, int v3, int v4)
    {
        int first_pair = Math.max (v1, v2);
        int second_pair = Math.max (v3, v4);

        return Math.max (first_pair, second_pair);
    }
}
neobio-0.0.20030929/src/neobio/alignment/InvalidSequenceException.java 0000644 0002656 0002032 00000005034 07717074366 024523 0 ustar tillea admin /*
* InvalidSequenceException.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
 * Checked exception raised when a sequence does not comply with the specification (see
 * {@linkplain CharSequence} or {@linkplain FactorSequence} for details).
 *
 * @author Sergio A. de Carvalho Jr.
 * @see CharSequence
 * @see FactorSequence
 */
public class InvalidSequenceException extends Exception
{
    /**
     * Creates an <CODE>InvalidSequenceException</CODE> whose error detail message is
     * null.
     */
    public InvalidSequenceException ()
    {
        super ();
    }

    /**
     * Creates an <CODE>InvalidSequenceException</CODE> carrying the given detail
     * message.
     *
     * @param message an error message
     */
    public InvalidSequenceException (String message)
    {
        super (message);
    }

    /**
     * Creates an <CODE>InvalidSequenceException</CODE> wrapping the given cause. The
     * detail message is derived from the cause by the superclass constructor.
     *
     * @param cause a cause
     */
    public InvalidSequenceException (Throwable cause)
    {
        super (cause);
    }

    /**
     * Creates an <CODE>InvalidSequenceException</CODE> carrying both a detail message
     * and a cause.
     *
     * @param message an error message
     * @param cause a cause
     */
    public InvalidSequenceException (String message, Throwable cause)
    {
        super (message, cause);
    }
}
neobio-0.0.20030929/src/neobio/alignment/CharSequence.java 0000644 0002656 0002032 00000012033 07717074372 022125 0 ustar tillea admin /*
* CharSequence.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.BufferedReader;
import java.io.IOException;
/**
 * A sequence of characters stored as an array, giving random access to any position in
 * constant time.
 *
 * <P>The input can come from any source, provided it is encapsulated in a proper
 * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next
 * <CODE>read</CODE> operation must return the first character of the sequence) and it is
 * not closed when its end is reached, so the client is allowed to reset it and maybe use
 * it for another purpose.</P>
 *
 * <P>Sequences can contain letters only, although lines started with the
 * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely
 * skipped. White spaces (including tabs, line feeds and carriage returns) are also
 * ignored throughout.</P>
 *
 * <P>This class is used by two sequence alignment algorithms: {@linkplain SmithWaterman}
 * and {@linkplain NeedlemanWunsch}.</P>
 *
 * @author Sergio A. de Carvalho Jr.
 * @see SmithWaterman
 * @see NeedlemanWunsch
 */
public class CharSequence
{
    /**
     * The character used to start a comment line in a sequence file. When this character
     * is found, the rest of the line is ignored.
     */
    protected static final char COMMENT_CHAR = '>';

    /**
     * Stores the sequence as an array of characters.
     */
    protected char[] sequence;

    /**
     * Builds a <CODE>CharSequence</CODE> from the characters delivered by the
     * <CODE>Reader</CODE> input stream.
     *
     * @param reader source of characters for this sequence
     * @throws IOException if an I/O exception occurs when reading the input
     * @throws InvalidSequenceException if the input does not contain a valid sequence
     */
    public CharSequence (Reader reader) throws IOException, InvalidSequenceException
    {
        BufferedReader input = new BufferedReader(reader);
        StringBuffer letters = new StringBuffer();

        // consume the input one character at a time until the end of the stream
        for (int code = input.read(); code != -1; code = input.read())
        {
            char symbol = (char) code;

            if (symbol == COMMENT_CHAR)
            {
                // comment: discard the remainder of the line
                input.readLine();
                continue;
            }

            if (Character.isLetter(symbol))
            {
                letters.append(symbol);
                continue;
            }

            // whitespace is silently ignored, anything else is rejected
            if (!Character.isWhitespace(symbol))
                throw new InvalidSequenceException
                    ("Sequences can contain letters only.");
        }

        // at least one letter must have been read
        if (letters.length() == 0)
            throw new InvalidSequenceException ("Empty sequence.");

        sequence = letters.toString().toCharArray();
    }

    /**
     * Returns the number of characters of this sequence.
     *
     * @return int number of characters of this sequence
     */
    public int length ()
    {
        return this.sequence.length;
    }

    /**
     * Returns the character at a given position. For the client, the first character is
     * at position 1, while the last character is at position <CODE>length()</CODE>. This
     * is convenient for sequence alignment algorithms based on a classic dynamic
     * programming matrix since the sequences usually start at row/column 1. This method
     * does not check boundaries, therefore an <CODE>ArrayIndexOutOfBoundsException</CODE>
     * may be raised if <CODE>pos</CODE> is out of bounds.
     *
     * @param pos position of character (from 1 to <CODE>length()</CODE> inclusive)
     * @return the character
     */
    public char charAt (int pos)
    {
        // positions are one-based for the client, the array is zero-based
        int index = pos - 1;
        return this.sequence[index];
    }

    /**
     * Returns a string representation of the sequence.
     *
     * @return a string representation of the sequence
     */
    public String toString ()
    {
        return String.valueOf(this.sequence);
    }
}
neobio-0.0.20030929/src/neobio/alignment/IncompatibleScoringSchemeException.java 0000644 0002656 0002032 00000005117 07717074376 026527 0 ustar tillea admin /*
* IncompatibleScoringSchemeException.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
 * Checked exception raised when a scoring scheme is not compatible with the sequences
 * being aligned.
 *
 * @author Sergio A. de Carvalho Jr.
 * @see ScoringScheme
 * @see PairwiseAlignmentAlgorithm
 */
public class IncompatibleScoringSchemeException extends Exception
{
    /**
     * Creates an <CODE>IncompatibleScoringSchemeException</CODE> whose error detail
     * message is null.
     */
    public IncompatibleScoringSchemeException ()
    {
        super ();
    }

    /**
     * Creates an <CODE>IncompatibleScoringSchemeException</CODE> carrying the given
     * detail message.
     *
     * @param message an error message
     */
    public IncompatibleScoringSchemeException (String message)
    {
        super (message);
    }

    /**
     * Creates an <CODE>IncompatibleScoringSchemeException</CODE> wrapping the given
     * cause. The detail message is derived from the cause by the superclass constructor.
     *
     * @param cause a cause
     */
    public IncompatibleScoringSchemeException (Throwable cause)
    {
        super (cause);
    }

    /**
     * Creates an <CODE>IncompatibleScoringSchemeException</CODE> carrying both a detail
     * message and a cause.
     *
     * @param message an error message
     * @param cause a cause
     */
    public IncompatibleScoringSchemeException (String message, Throwable cause)
    {
        super (message, cause);
    }
}
neobio-0.0.20030929/src/neobio/alignment/Trie.java 0000644 0002656 0002032 00000025134 07723762434 020470 0 ustar tillea admin /*
* Trie.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
* This class implements a trie, or a digital search tree. A trie is a multiway tree
* (each node can have multiple children) that represents a set of strings.
*
* Each node contains data encapsulated in an object instance. Each edge spells out a * character and each path from the root represents a string described by the characters * labelling the traversed edges. Moreover, for each string represented, there is a unique * path from the root.
* *The trie of the following example represents the strings 'a', 'd', 'b', 'ac', 'ba', * 'be', 'bd', 'bad' and 'bae'.
* *
* [0]
* --+--
* / | \
* a/ d| \b
* [1] [2] [4]
* | --+--
* | / | \
* c| a/ e| d\
* [3] [5] [6] [9]
* --+--
* / \
* d/ e\
* [7] [8]
*
*
* It is easy to see that strings with common prefixes will branch off from each other
* at the first distinguishing character. This feature makes the trie a good data
* structure to identify and represent phrases of a text such as the ones induced by the
* Lempel-Ziv family of compression algorithms. For instance, the LZ78 version parses
* the text into phrases, where each phrase is the longest matching phrase seen previously
* plus one character.
* *In this implementation, each node is actually an instance of this class. To build a * trie, one must first create the root using the public constructor:
* *
* Trie root = new Trie (some_object);
*
*
* Here some_object
contains any relevant information encapsulated in an
* object instance. Typically, that's the only moment the public constructor is used. From
* now on, all new nodes will be added as a new child of one existing node using the
* add
method:
* new_node = any_node.add (some_object, character);
*
*
* Here character
is the character that will label the edge from
* any_node
to new_node
. Note that this transition must not
* already exist, otherwise an exception is raised.
*
*
To find the longest prefix of a given string, we follow a path from the root down
* the tree, character by character, with the spellDown
method:
* next_node = root;
* while (next_node != null)
* {
* current_node = next_node;
* char c = get next character from somewhere
* next_node = current_node.spellDown (c);
* }
*
*
* spellDown
follows the edge out of current_node
labelled by
* the character c
and returns the next node. If there is no such a path, it
* returns null.
To retrieve the information stored at any node, simply use the getData
* method.
In fact, there are many ways to implement a trie. To avoid wasting space with * multiple pointers at each node, this implementation uses an approach with a linked list * of siblings. Each node actually contains a pointer to one of its children and a pointer * to one of its siblings only. Together with the pointers, each node also stores the * character that labels the edge to the pointed node.
*
*
*
*
* [0]
* |
* a| d b
* [1]---[2]---[4]
* | |
* c| a| e d
* [3] [5]---[6]---[9]
* |
* d| e
* [7]---[8]
*
* In this way, a trie is similar to a binary tree. Although this implementation is
* more efficient in terms of space, the search for a label with a given character leaving
* a node n is no longer constant but proportional to the number of children of n. In the
* previous example, it is necessary to traverse three edges to reach node 9 from node 4
* with character d.
This class is used by the {@linkplain FactorSequence} to build a linked list of * factors of a sequence in a LZ78 fashion, i.e. where each factor is the longest factor * previously seen plus one character.
* * @author Sergio A. de Carvalho Jr. * @see FactorSequence */ public class Trie { /** * A pointer to the first of this node's children. */ protected Trie son; /** * The character that labels the edge from this node to the child node pointer by *son
.
*/
protected char to_son;
/**
* A pointer to this node's next sibling.
*/
protected Trie sibling;
/**
* The character that labels the edge from this node to the sibling pointer by
* sibling
.
*/
protected char to_sibling;
/**
* This node's stored data.
*/
protected Object data;
/**
 * Creates a new trie node holding the specified data, with no child and no sibling.
 * Clients typically call this constructor only once, to instantiate the root node;
 * every other node is instantiated implicitly by the <CODE>add</CODE> method.
 *
 * @param data the data that will be associated with the new node
 */
public Trie (Object data)
{
    this.data = data;

    // a fresh node has neither children nor siblings
    this.sibling = null;
    this.son = null;
}
/**
 * Gives access to the data stored at this node.
 *
 * @return data associated with this node
 */
public Object getData ()
{
    return this.data;
}
/**
 * Adds a new child to this node. The new node is implicitly instantiated with the
 * <CODE>data</CODE> argument, and the edge from this node to the new node is labelled
 * by the character argument. If this node already has an edge labelled with this
 * character, an exception is raised. Otherwise, the new node is created and returned.
 *
 * <P>If this node has no child, the new node becomes its first child straight away.
 * Otherwise, the task is handed to the first child, which adds the new node as one of
 * its siblings.</P>
 *
 * @param data the data that will be associated with the new node
 * @param c the character that will label the edge from this node to the new node
 * @return the added node
 * @throws IllegalStateException if this node already has an edge labelled by
 * <CODE>c</CODE>
 */
public Trie add (Object data, char c)
{
    // no child yet: the new node becomes the first child
    if (son == null)
    {
        son = new Trie (data);
        to_son = c;
        return son;
    }

    // the edge to the first child already spells this character
    // (message format kept identical to the one raised by addSibling)
    if (to_son == c)
        throw new IllegalStateException ("Failed to add character: " + c +
            " already exists.");

    // otherwise the first child appends the new node to its chain of siblings
    return son.addSibling (data, c);
}
/**
 * Adds a sibling to this node. The new node is implicitly instantiated with the
 * <CODE>data</CODE> argument, and the edge from this node to the new node is labelled
 * by the character argument. If this node already has a sibling with this character,
 * an exception is raised. Otherwise, the new node is created and returned.
 *
 * <P>If this node has no direct sibling, the new node is created straight away.
 * Otherwise, the task is handed to its next sibling.</P>
 *
 * @param data the data that will be associated with the new node
 * @param c the character that will label the edge from this node to the new node
 * @return the added node
 * @throws IllegalStateException if this node already has an edge labelled by
 * <CODE>c</CODE>
 */
protected Trie addSibling (Object data, char c)
{
    // end of the chain reached: the new node becomes the direct sibling
    if (sibling == null)
    {
        sibling = new Trie (data);
        to_sibling = c;
        return sibling;
    }

    // the edge to the direct sibling already carries this character
    if (to_sibling == c)
    {
        throw new IllegalStateException ("Failed to add character: " + c +
            " already exists.");
    }

    // keep walking along the chain of siblings
    return sibling.addSibling (data, c);
}
/**
 * Follows a path from this node to one of its children by spelling the character
 * supplied as an argument. If there is no such path, <CODE>null</CODE> is returned.
 * Otherwise, the reached child node is returned.
 *
 * <P>If this node's first child is reached by an edge labelled with the character, it
 * is returned straight away. Otherwise, the first child is given the task of finding
 * a sibling labelled with that character.</P>
 *
 * @param c the character that labels the path to be followed to this node's child
 * @return the child node reached by traversing the edge labelled by <CODE>c</CODE>,
 * or <CODE>null</CODE> if there is no such child
 */
public Trie spellDown (char c)
{
    // a node without children has no outgoing path at all
    if (son == null)
    {
        return null;
    }

    // either the first child matches, or the search continues among its siblings
    return (to_son == c) ? son : son.spellRight (c);
}
/**
 * Follows a path from this node to one of its siblings by spelling the character
 * supplied as an argument. If there is no such path, <CODE>null</CODE> is returned.
 * Otherwise, the reached sibling node is returned.
 *
 * <P>If this node's direct sibling is reached by an edge labelled with the character,
 * it is returned straight away. Otherwise, the direct sibling is given the task of
 * finding another sibling labelled with that character.</P>
 *
 * @param c the character that labels the path to be followed to the sibling
 * @return the sibling node reached by traversing the edge labelled by <CODE>c</CODE>,
 * or <CODE>null</CODE> if there is no such sibling
 */
protected Trie spellRight (char c)
{
    // end of the chain of siblings: the character was not found
    if (sibling == null)
    {
        return null;
    }

    // either the direct sibling matches, or the search moves one step to the right
    return (to_sibling == c) ? sibling : sibling.spellRight (c);
}
}
neobio-0.0.20030929/src/neobio/alignment/CrochemoreLandauZivUkelson.java 0000644 0002656 0002032 00000074667 07725335516 025050 0 ustar tillea admin /*
* CrochemoreLandauZivUkelson.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.IOException;
/**
* This abstract class is the superclass of both global and local sequence alignment
* algorithms (with linear gap penalty function) due to Maxime Crochemore, Gad Landau and
* Michal Ziv-Ukelson (2002).
*
* This implementation derives from the paper of M.Crochemore, G.Landau and * M.Ziv-Ukelson, A Sub-quadratic Sequence Alignment Algorithm for Unrestricted Scoring * Matrices (available here as * PDF or * Postscript).
* *It employs Lempel-Ziv compression techniques to speed up the classic dynamic * programmin approach to sequence alignment (see {@linkplain NeedlemanWunsch} and * {@linkplain SmithWaterman} classes). It reduces the time and space complexity from * O(n2) to O(n2/log n). In fact, the complexity is actually O(h * n2/log n), where 0 <= h <= 1 is a real number denoting the entropy of the * text (a measure of how compressible it is). This means that, the more compressible a * sequence is, the less memory the algorithm will require, and the faster it will * run.
* *The idea behind this improvement is to identify repetitions in the sequences and * reuse the computation of their alignments. The first step is, therefore, to parse the * sequences into LZ78-like factors. LZ78 is a popular compression algorithm of the * Lempel-Ziv familiy due to J.Ziv and A.Lempel (1978). This factorisation is accomplished * by the {@linkplain FactorSequence} class (for more information about this * factorisation, please refer to the specification of that class) that builds a * doubly-linked list of factors. Each factor is an instance of the {@linkplain Factor} * class (refer to the specification of this class for more information).
* *Once the sequences have been parsed, the algorithm builds a matrix of blocks, called
* block table, that is vaguely similar to the dynamic programming matrix used by both
* NeedlemanWunsch
and SmithWaterman
. Each block contains an
* instance of an {@linkplain AlignmentBlock} (please refer to its specification for more
* information on what information is stored) and represents the alignment between one
* factor of each sequence. This block table is, in fact, a partition of the alignment
* graph.
Consider a block B which corresponds to the alignment of factor F1 = xa from * sequence S1 and factor F2 = yb from sequence S2. Here, F1 extends a previous factor of * S1 with character a, while F2 extends a previous factor of S2 with character b. We can * define the input border of B as the set of values at the left and top borders of block * B, and the output border as the set of values at the right and bottom borders of B. * Moreover, we can define the following prefix blocks of B:
* *Note that each factor has a pointer to its prefix factor, called ancestor (see the
* specification of the Factor
class). This pointer makes it easy to retrieve
* any of the three prefix blocks of B in constant time.
Rather than computing each value of the alignment block B, the algorithm will only * compute the values on its input and output borders. This is precisely what makes it * more efficient.
* *In this class there is a general specification of how the block table is computed * (see the {@link #computeBlockTable computeBlockTable} method for details). The actual * method depends on the subclasses. In general, there are two phases:
* *In fact, for each block, only one column of the DIST matrix needs to be computed, * all other columns are actually retrieved from its prefix blocks. This is precisely what * is accomplished by the {@link #assembleDistMatrix assembleDistMatrix} method found in * this class (it is general enough for both global and local alignment versions of the * algorithm.
* *From the DIST matrix, we obtain the OUT matrix defined as
* OUT[i,j] = I[i] + DIST[i,j]
where I is the input border array. This means
* that the OUT matrix is the DIST matrix updated by the input border of a block. The
* output border is then computed from the OUT matrix by taking the maximum value of each
* column. This class also have a general method for assembling the input border (see
* {@link #assembleInputBorder assembleInputBorder}
The OUT matrix is encoded by the {@linkplain OutMatrix} class that takes as * both a DIST matrix and an input border array. Note that it does not compute the OUT * matrix, it just stores the necessary information to retrieve a value at any * position of the matrix.
* *A naive approach to compute the output border of a block from the OUT matrix of size * n x n would take a time proportional to n2. However, it happens that, due to * the nature of the DIST matrix, both DIST and OUT matrices are Monge arrays, which * implies that they are also totally monotone. This property allows the * computation of the output border of B in linear time with the SMAWK algorithm (see the * specification of the {@linkplain Smawk} class for more information on SMAWK).
* *This class contains a general specification that is pertinent to both global and * local versions of the algorithm. For more information on each version of, please refer * to the appropriate subclass.
* *A note about the performance of these algorithms. Although theoretical * results suggest that these algorithms are faster and consume less memory than the * classical methods, in practice it is hard to realise their performance gains. * *
These algorithms are extremely complex and require the storage of many extra
* pointers and other auxiliary data for each block (see the AlignmentBlock
* class for more details). Hence, even though the space requirement is
* O(n2/log n), which is less than O(n2), in practice, for most of
* the cases these algorithms will take more time and memory space than their classical
* counterparts (we have to keep in mind that the Big Oh notation ignores all constants
* involved).
*
* <P>Therefore, in order to realise the full power of these algorithms, they have to be
* used with extremely large and redundant sequences. This will allow a proper
* reutilisation of the computations and, maybe, provide an improvement in terms of space
* and run time. For instance, it is easy to devise such a sequence if we use a
* one-character alphabet because, in this case, a sequence is factorised into a series
* of factors that are a prefix of the next.</P>
* * @author Sergio A. de Carvalho Jr. * @see CrochemoreLandauZivUkelsonGlobalAlignment * @see CrochemoreLandauZivUkelsonLocalAlignment * @see NeedlemanWunsch * @see SmithWaterman * @see FactorSequence * @see AlignmentBlock * @see OutMatrix * @see Smawk * @see #computeBlockTable * @see #assembleDistMatrix */ public abstract class CrochemoreLandauZivUkelson extends PairwiseAlignmentAlgorithm { /** * A constant that indicates that the source of an optimal path has been reached in a * block and that the trace back procedure to retrieve a high scoring alignment can * stop. */ protected static final byte STOP_DIRECTION = 0; /** * A constant that indicates that the left direction must be followed to reach the * source of an optimal path in a block during the trace back procedure to retrieve a * high scoring alignment. */ protected static final byte LEFT_DIRECTION = 1; /** * A constant that indicates that the diagonal direction must be followed to reach the * source of an optimal path in a block during the trace back procedure to retrieve a * high scoring alignment. */ protected static final byte DIAGONAL_DIRECTION = 2; /** * A constant that indicates that the top direction must be followed to reach the * source of an optimal path in a block during the trace back procedure to retrieve a * high scoring alignment. */ protected static final byte TOP_DIRECTION = 3; /** * The first factorised sequence being aligned. */ protected FactorSequence seq1; /** * The second factorised sequence being aligned. */ protected FactorSequence seq2; /** * The block table, which is a matrix of alignment blocks where each block represents * the alignment between one factor of each sequence. */ protected AlignmentBlock[][] block_table; /** * Number of rows of the block table. It is determined by the number of factors of the * first sequence. */ protected int num_rows; /** * Number of columns of the block table. It is determined by the number of factors of * the second sequence. 
*/ protected int num_cols; /** * An instance of theSmawk
class that implements the SMAWK algorithm to
* compute the column maxima of a totally monotone matrix. It is used to speed up the
* computation of the output border of a block.
*/
protected Smawk smawk = new Smawk();
/**
* An instance of the OutMatrix
class that encodes the OUT matrix of a
* block when supplied with the DIST matrix and the input border array of a block.
* Note that it does not compute the OUT matrix itself, it just stores the necessary
* information to retrieve a value at any position of the matrix.
*
* This object is then used to compute the output border of a block with the
* Smawk
class. Note that the OutMatrix
class implements the
* Matrix
interface as required by the Smawk
class.
*
* @see Matrix
* @see Smawk
*/
protected OutMatrix out_matrix = new OutMatrix ();
/**
 * Loads the sequences into <CODE>FactorSequence</CODE> instances. In case of any
 * error, an exception is raised by the constructor of <CODE>FactorSequence</CODE>
 * (please check the specification of that class for specific requirements).
 *
 * <P>A <CODE>FactorSequence</CODE> is an LZ78-like factorisation of the sequences
 * being aligned.</P>
 *
 * @param input1 Input for first sequence
 * @param input2 Input for second sequence
 * @throws IOException If an I/O error occurs when reading the sequences
 * @throws InvalidSequenceException If the sequences are not valid
 * @see FactorSequence
 */
protected void loadSequencesInternal (Reader input1, Reader input2)
    throws IOException, InvalidSequenceException
{
    // factorise each input (the first sequence is stored before the second is read)
    seq1 = new FactorSequence (input1);
    seq2 = new FactorSequence (input2);

    // the block table has one row per factor of sequence 1
    // and one column per factor of sequence 2
    num_rows = seq1.numFactors ();
    num_cols = seq2.numFactors ();
}
/**
 * Releases the references to the loaded sequences and to the block table so that
 * their data can be garbage collected.
 */
protected void unloadSequencesInternal ()
{
    block_table = null;
    seq2 = null;
    seq1 = null;
}
/**
* Computes the block table (the result depends on subclasses, see
* computeBlockTable
for details) and requests subclasses to retrieve an
* optimal alignment between the loaded sequences. The actual product depends on the
* subclasses which can produce a global (see
* CrochemoreLandauZivUkelsonGlobalAlignment
) or local alignment (see
* CrochemoreLandauZivUkelsonLocalAlignment
).
*
*
Subclasses are required to implement the buildOptimalAlignment
* abstract method defined by this class according to their own method.
computeBlockTable
for details) and requests subclasses to locate the
* score of the highest scoring alignment between the two sequences in the block
* table. The result depends on the subclasses, and either a global alignment
* (see CrochemoreLandauZivUkelsonGlobalAlignment
) or local alignment
* score (see CrochemoreLandauZivUkelsonLocalAlignment
) will be produced.
*
* Subclasses are required to implement the locateScore
abstract
* method defined by this class according to their own method.
Note that this method calculates the similarity value only (it doesn't trace * back into the block table to retrieve the alignment itself).
* * @return the score of the highest scoring alignment between the loaded sequences * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible * with the loaded sequences. * @see CrochemoreLandauZivUkelsonGlobalAlignment * @see CrochemoreLandauZivUkelsonLocalAlignment * @see #locateScore */ protected int computeScore () throws IncompatibleScoringSchemeException { // compute block table computeBlockTable (); // get score int score = locateScore (); // allow the block table to be garbage collected block_table = null; return score; } /** * Computes the block table. This method is a general specification of how the block * table should be computed. It creates the block table according to the number of * factors of each sequence. It then goes over each position of the block table, * retrieves the corresponding factors from each sequence, and repasses the * information to the subclasses that will do the actual computation of each block * using the scoring scheme previously set. * *There are four different cases that defines four abstract methods in this class, * which subclasses must implement:
* *Note that each factor has a serial number which indicates its order in the list * of factors of a sequence. This number will match with the row and column index of * a block in the block table. For instance, if a block has factors F1 and F2 with * serial numbers 12 and 53, respectively, this means that this block is found at row * 12, column 53 of the block table.
* * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible * with the loaded sequences. * @see #createRootBlock * @see #createFirstColumnBlock * @see #createFirstRowBlock * @see #createBlock */ protected void computeBlockTable () throws IncompatibleScoringSchemeException { Factor factor1, factor2; int r, c, max_length; // create block table block_table = new AlignmentBlock[num_rows][num_cols]; // find the length of the longest sequence (number of characters) max_length = Math.max(seq1.numChars(), seq2.numChars()); // prepares the OUT matrix object out_matrix.init (max_length, scoring.maxAbsoluteScore()); // start at the root of each trie factor1 = seq1.getRootFactor(); factor2 = seq2.getRootFactor(); // check if roots' indexes are both zero if (factor1.getSerialNumber() != 0 || factor2.getSerialNumber() != 0) throw new IndexOutOfBoundsException ("Unexpected factor index."); // initiate first cell of block table block_table[0][0] = createRootBlock (factor1, factor2); // compute first row for (c = 1; c < num_cols; c++) { factor2 = factor2.getNext(); // check if factor2's index equals its column number (except for // the last factor that can be a repetition of a previous one) if (c < num_cols - 1 && factor2.getSerialNumber() != c) throw new IndexOutOfBoundsException ("Unexpected factor index."); block_table[0][c] = createFirstRowBlock (factor1, factor2, c); } for (r = 1; r < num_rows; r++) { factor1 = factor1.getNext(); // check if factor1's index equals its row number (except for // the last factor that can be a repetition of a previous one) if (r < num_rows - 1 && factor1.getSerialNumber() != r) throw new IndexOutOfBoundsException ("Unexpected factor index."); // go back to the root of sequence 2 factor2 = seq2.getRootFactor(); if (factor2.getSerialNumber() != 0) throw new IndexOutOfBoundsException ("Unexpected factor index."); // compute first column of current row block_table[r][0] = createFirstColumnBlock (factor1, factor2, r); for (c 
= 1; c < num_cols; c++) { factor2 = factor2.getNext(); // check if factor2's index equals its column number (except for // the last factor that can be a repetition of a previous one) if (c < num_cols - 1 && factor2.getSerialNumber() != c) throw new IndexOutOfBoundsException ("Unexpected factor index."); // compute row r, col c block_table[r][c] = createBlock (factor1, factor2, r, c); } } } /** * Assembles the DIST matrix of a block by retrieving the DIST columns of its prefix * blocks. In fact, it also stores pointers to the owner block for each column * retrieved. These pointers are later used during the trace back procedure that * builds an optimal alignment from the information computed in the block table. This * method is general enough to suit both global and local alignment versions of the * algorithm. * * @param block the block for which the DIST matrix is needed * @param dim the dimension of the DIST matrix * @param r the row index of this block in the block table * @param c the column index of this block in the block table * @param lc the number of columns of the alignment block * @return the DIST matrix */ protected int[][] assembleDistMatrix (AlignmentBlock block, int dim, int r, int c, int lc) { AlignmentBlock ancestor; Factor parent; int[][] dist; int i; dist = new int[dim][]; // columns to the left of lc parent = block.factor2.getAncestor(); for (i = lc - 1; i >= 0; i--) { ancestor = block_table[r][parent.getSerialNumber()]; block.ancestor[i] = ancestor; dist[i] = ancestor.dist_column; parent = parent.getAncestor(); } // column lc dist[lc] = block.dist_column; block.ancestor[lc] = block; // columns to the right of lc parent = block.factor1.getAncestor(); for (i = lc + 1; i < dim; i++) { ancestor = block_table[parent.getSerialNumber()][c]; block.ancestor[i] = ancestor; dist[i] = ancestor.dist_column; parent = parent.getAncestor(); } return dist; } /** * Assembles the input border of a block by retrieving the values at the output * borders of the left 
* and top blocks. This method is general enough to suit both global and local
* alignment versions of the algorithm. Note that it can be used to assemble the
* input border of any block but the root one (for which it will cause an
* <CODE>ArrayIndexOutOfBoundsException</CODE>).
*
* @param dim dimension of the input border
* @param r row index of the block in the block table
* @param c column index of the block in the block table
* @param lr number of row of the block
* @return the block's input border
*/
protected int[] assembleInputBorder (int dim, int r, int c, int lr)
{
    // a big negative value marks entries that have no neighbouring block, so that
    // they will never be chosen (MIN_VALUE itself could overflow to a positive
    // value when something is subtracted from it, hence half of it is used)
    final int no_value = Integer.MIN_VALUE / 2;

    int[] input = new int [dim];

    // neighbouring blocks (null when this block is on the first column / first row)
    AlignmentBlock left = (c > 0) ? block_table[r][c-1] : null;
    AlignmentBlock top = (r > 0) ? block_table[r-1][c] : null;

    for (int i = 0; i < dim; i++)
    {
        if (i <= lr && left != null)
        {
            // entries up to and including position lr come from the left block
            input[i] = left.output_border[left.factor2.length() + i];
        }
        else if (i == lr)
        {
            // no left block: the entry at position lr comes from the top block
            // (top is not checked for null because this method must not be
            // used for the root block, the only one without both neighbours)
            input[i] = top.output_border[i - lr];
        }
        else if (i > lr && top != null)
        {
            // entries after position lr come from the top block
            input[i] = top.output_border[i - lr];
        }
        else
        {
            // the neighbouring block that would supply this entry does not exist
            input[i] = no_value;
        }
    }

    return input;
}
/**
 * Traverses a block to retrieve a part of an optimal alignment, from the specified
 * source in the output border to an entry in the input border. At each step one
 * column of the alignment is inserted at the front of the three buffers, and the
 * traversal moves to the left, diagonal or top prefix block according to the
 * direction stored for the current source. It ends when a stop direction is found.
 *
 * @param block the block to be traversed
 * @param source the source of the path in the output border
 * @param gapped_seq1 the StringBuffer to where the gapped sequence 1 is written to
 * @param tag_line the StringBuffer to where the tag_line is written to
 * @param gapped_seq2 the StringBuffer to where the gapped sequence 2 is written to
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected void traverseBlock (AlignmentBlock block, int source,
    StringBuffer gapped_seq1, StringBuffer tag_line, StringBuffer gapped_seq2)
    throws IncompatibleScoringSchemeException
{
    while (block.direction[source] != STOP_DIRECTION)
    {
        // the characters that extend the factors of the current block
        char char1 = block.factor1.getNewChar();
        char char2 = block.factor2.getNewChar();

        switch (block.direction[source])
        {
            case LEFT_DIRECTION:
                // character of sequence 2 aligned with a gap
                gapped_seq1.insert (0, GAP_CHARACTER);
                tag_line.insert (0, GAP_TAG);
                gapped_seq2.insert (0, char2);

                block = getLeftPrefix (block);
                break;

            case DIAGONAL_DIRECTION:
                // characters of both sequences aligned with each other
                gapped_seq1.insert (0, char1);

                if (char1 != char2)
                {
                    if (scoreSubstitution (char1, char2) > 0)
                    {
                        tag_line.insert (0, APPROXIMATE_MATCH_TAG);
                    }
                    else
                    {
                        tag_line.insert (0, MISMATCH_TAG);
                    }
                }
                else if (useMatchTag())
                {
                    tag_line.insert (0, MATCH_TAG);
                }
                else
                {
                    tag_line.insert (0, char1);
                }

                gapped_seq2.insert (0, char2);

                block = getDiagonalPrefix (block);
                source--;
                break;

            case TOP_DIRECTION:
                // character of sequence 1 aligned with a gap
                gapped_seq1.insert (0, char1);
                tag_line.insert (0, GAP_TAG);
                gapped_seq2.insert (0, GAP_CHARACTER);

                block = getTopPrefix (block);
                source--;
                break;
        }
    }
}
/**
 * Shorthand to retrieve the left prefix of a block from the block table, i.e. the
 * block on the row of the block's factor 1 and on the column of the ancestor of its
 * factor 2.
 *
 * @param block the block
 * @return the block's left prefix
 */
protected AlignmentBlock getLeftPrefix (AlignmentBlock block)
{
    final int row = block.factor1.getSerialNumber();
    final int col = block.factor2.getAncestorSerialNumber();

    return block_table[row][col];
}
/**
 * Shorthand to retrieve the diagonal prefix of a block from the block table, i.e. the
 * block on the row of the ancestor of the block's factor 1 and on the column of the
 * ancestor of its factor 2.
 *
 * @param block the block
 * @return the block's diagonal prefix
 */
protected AlignmentBlock getDiagonalPrefix (AlignmentBlock block)
{
    final int row = block.factor1.getAncestorSerialNumber();
    final int col = block.factor2.getAncestorSerialNumber();

    return block_table[row][col];
}
/**
 * Shorthand to retrieve the top prefix of a block from the block table, i.e. the
 * block on the row of the ancestor of the block's factor 1 and on the column of its
 * factor 2.
 *
 * @param block the block
 * @return the block's top prefix
 */
protected AlignmentBlock getTopPrefix (AlignmentBlock block)
{
    final int row = block.factor1.getAncestorSerialNumber();
    final int col = block.factor2.getSerialNumber();

    return block_table[row][col];
}
/**
 * Computes the root block of the block table. See subclasses for actual
 * implementation.
 *
 * <P><CODE>computeBlockTable</CODE> calls this method once, with the root factors of
 * both sequences, and stores the result at row zero, column zero of the block
 * table.</P>
 *
 * @param factor1 the factor of the first sequence being aligned
 * @param factor2 the factor of the second sequence being aligned
 * @return the root block
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected abstract AlignmentBlock createRootBlock (Factor factor1, Factor factor2)
throws IncompatibleScoringSchemeException;
/**
 * Computes a block at the first row (row zero) of the block table, which corresponds
 * to an alignment block between one factor of sequence 2 and an empty string. See
 * subclasses for actual implementation.
 *
 * <P><CODE>computeBlockTable</CODE> calls this method for every column but column
 * zero, always passing the root factor of the first sequence as
 * <CODE>factor1</CODE>, and stores the result at row zero of the given column.</P>
 *
 * @param factor1 the factor of the first sequence being aligned
 * @param factor2 the factor of the second sequence being aligned
 * @param col the column index of block in the block table
 * @return the computed block
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected abstract AlignmentBlock createFirstRowBlock (Factor factor1, Factor factor2,
int col) throws IncompatibleScoringSchemeException;
/**
 * Computes a block at the first column (column zero) of the block table, which
 * corresponds to an alignment block between one factor of sequence 1 and an empty
 * string. See subclasses for actual implementation.
 *
 * <P><CODE>computeBlockTable</CODE> calls this method for every row but row zero,
 * always passing the root factor of the second sequence as <CODE>factor2</CODE>, and
 * stores the result at column zero of the given row.</P>
 *
 * @param factor1 the factor of the first sequence being aligned
 * @param factor2 the factor of the second sequence being aligned
 * @param row the row index of block in the block table
 * @return the computed block
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected abstract AlignmentBlock createFirstColumnBlock (Factor factor1,
Factor factor2, int row) throws IncompatibleScoringSchemeException;
/**
 * Computes a block of the block table, which corresponds to an alignment block
 * between one factor of sequence 1 and one factor of sequence 2. See subclasses for
 * actual implementation.
 *
 * <P><CODE>computeBlockTable</CODE> calls this method for every position of the
 * block table that is neither on row zero nor on column zero, and stores the result
 * at the given row and column.</P>
 *
 * @param factor1 the factor of the first sequence being aligned
 * @param factor2 the factor of the second sequence being aligned
 * @param row the row index of block in the block table
 * @param col the column index of block in the block table
 * @return the computed block
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected abstract AlignmentBlock createBlock (Factor factor1, Factor factor2,
int row, int col) throws IncompatibleScoringSchemeException;
/**
 * Retrieves an optimal alignment between the loaded sequences. See subclasses for
 * actual implementation.
 *
 * @return an optimal alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected abstract PairwiseAlignment buildOptimalAlignment ()
throws IncompatibleScoringSchemeException;
/**
 * Locates the score of the highest scoring alignment between the two sequences in the
 * block table after it has been computed. See subclasses for actual implementation.
 *
 * <P><CODE>computeScore</CODE> calls this method right after
 * <CODE>computeBlockTable</CODE>. Note that, unlike the other abstract methods of
 * this class, this method's signature does not declare any exception.</P>
 *
 * @return the score of the highest scoring alignment between the loaded sequences
 */
protected abstract int locateScore ();
}
neobio-0.0.20030929/src/neobio/alignment/InvalidScoringMatrixException.java 0000644 0002656 0002032 00000005044 07717074370 025540 0 ustar tillea admin /*
* InvalidScoringMatrixException.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
 * Signals that the substitution matrix does not comply with the specification (see
 * {@linkplain ScoringMatrix} for details).
 *
 * @author Sergio A. de Carvalho Jr.
 * @see ScoringMatrix
 */
public class InvalidScoringMatrixException extends Exception
{
    /**
     * Constructs an <CODE>InvalidScoringMatrixException</CODE> with
     * <CODE>null</CODE> as its error detail message.
     */
    public InvalidScoringMatrixException()
    {
        super();
    }

    /**
     * Constructs an <CODE>InvalidScoringMatrixException</CODE> with the specified
     * detail message.
     *
     * @param message an error message
     */
    public InvalidScoringMatrixException(String message)
    {
        super(message);
    }

    /**
     * Constructs an <CODE>InvalidScoringMatrixException</CODE> with the specified
     * cause (and a detail message that typically contains the class and detail
     * message of the cause).
     *
     * @param cause a cause
     */
    public InvalidScoringMatrixException(Throwable cause)
    {
        super(cause);
    }

    /**
     * Constructs an <CODE>InvalidScoringMatrixException</CODE> with the specified
     * detail message and cause.
     *
     * @param message an error message
     * @param cause a cause
     */
    public InvalidScoringMatrixException(String message, Throwable cause)
    {
        super(message, cause);
    }
}
neobio-0.0.20030929/src/neobio/alignment/Smawk.java 0000644 0002656 0002032 00000046673 07725335744 020664 0 ustar tillea admin /*
* Smawk.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
* This class implements the SMAWK algorithm to compute column maxima on a totally
* monotone matrix, as described in the paper cited below.
*
* This implementation derives from the paper of A.Aggarwal, M.Klawe, S.Moran, P.Shor, * and R.Wilber, Geometric Applications of a Matrix Searching Algorithm, * Algorithmica, 2, 195-208 (1987).
*
* <P>The matrix must be an object that implements the {@linkplain Matrix} interface. It
* is also expected to be totally monotone, and the number of rows should be greater than
* or equal to the number of columns. If these conditions are not met, the result is
* unpredictable and can lead to an ArrayIndexOutOfBoundsException.</P>
* *{@link #computeColumnMaxima computeColumnMaxima} is the main public method of this * class. It computes the column maxima of a given matrix, i.e. the rows that contain the * maximum value of each column in O(n) (linear) time, where n is the number of rows. This * method does not return the maximum values itself, but just the indexes of their * rows.
*
* <P>Note that it is necessary to create an instance of this class to execute
* <CODE>computeColumnMaxima</CODE>, because it stores temporary data in that instance.
* To prevent problems with concurrent access, the <CODE>computeColumnMaxima</CODE>
* method is declared <CODE>synchronized</CODE>.</P>
*
*
*
*
* // create an instance of Smawk
* Smawk smawk = new Smawk();
*
* // create an array to store the result
* int[] col_maxima = new int [some_matrix.numColumns()];
*
* // now compute column maxima
* smawk.computeColumnMaxima (some_matrix, col_maxima);
*
Note that the array of column maxima indexes (the computation result) must be * created beforehand and its size must be equal to the number of columns of the * matrix.
* *This implementation creates arrays of row and column indexes from the original array * and simulates all operations (reducing, deletion of odd columns, etc.) by manipulating * these arrays. The benefit is two-fold. First the matrix is not required to implement * any of these this operations but only a simple method to retrieve a value at a given * position. Moreover, it tends to be faster since it uses a manipulation of these small * vectors and no row or column is actually deleted. The downside is, of course, the use * of extra memory (in practice, however, this is negligible).
* *Note that this class does not contain a computeRowMaxima
method,
* however, the computeColumnMaxima
can easily be used to compute row maxima
* by using a transposed matrix interface, i.e. one that inverts the indexes of the
* valueAt
method (returning [col,row] when [row,col] is requested) and swaps
* the number of rows by the number of columns, and vice-versa.
Another simpler method, {@link #naiveComputeColumnMaxima naiveComputeColumnMaxima}, * does the same job without using the SMAWK algorithm. It takes advantage of the monotone * property of the matrix only (SMAWK explores the stronger constraint of total * monotonicity), and therefore has a worst case time complexity of O(n * m), where n is * the number of rows and m is the number of columns. However, this method tends to be * faster for small matrices because it avoids recursions and row and column * manipulations. There is also a * {@linkplain #naiveComputeRowMaxima naiveComputeRowMaxima} method to compute row maxima * with the naive approach.
*
* @author Sergio A. de Carvalho Jr.
* @see Matrix
*/
public class Smawk
{
/**
* A pointer to the matrix that is being manipulated.
*/
protected Matrix matrix;
/**
* The matrix's current number of rows. This reflects any deletion of rows already
* performed.
*/
protected int numrows;
/**
* An array of row indexes reflecting the current state of the matrix. When rows are
* deleted, the corresponding indexes are simply moved to the end of the array.
*/
protected int row[];
/**
* This array is used to store, for each row of the original matrix, its index in the
* current state of the matrix, i.e. its index in the <CODE>row</CODE> array.
*/
protected int row_position[];
/**
* The matrix's current number of columns. This reflects any deletion of columns
* already performed.
*/
protected int numcols;
/**
* An array of column indexes reflecting the current state of the matrix. When columns
* are deleted, the corresponding indexes are simply moved to the end of the array.
*/
protected int col[];
/**
* Computes the column maxima of a given matrix. It first sets up arrays of row and
* column indexes to simulate a copy of the matrix (where all operations will be
* performed). It then calls the recursive protected computeColumnMaxima
* method.
*
* The matrix is required to be an object that implements the Matrix
* interface. It is also expected to be totally monotone, and the number of rows
* should be greater than or equal to the number of columns. If it is not, the
* result is unpredictable and can lead to an ArrayIndexOutOfBoundsException.
This method does not return the maximum values itself, but just the indexes of * their rows. Note that the array of column maxima (the computation result) must be * created beforehand and its size must be equal to the number of columns of the * matrix.
* *To prevent problems with concurrent access, this method is declared
* synchronized
.
The first step is to reduce the matrix to a quadratic size (if necessary). It
* then delete all odd columns and recursively computes column maxima for this matrix.
* Finally, using the information computed for the odd columns, it searches for
* column maxima of the even columns. The column maxima are progressively stored in
* the col_maxima
array (each recursive call will compute a set of
* column maxima).
valueAt
method
* of the matrix. It returns the value at row r
, column c
.
*
* @param r the row number of the value being retrieved
* @param c the column number of the value being retrieved
* @return the value at row r
, column c
* @see Matrix#valueAt
*/
protected final int valueAt (int r, int c)
{
    // translate positions of the current view into indexes of the underlying matrix
    final int mapped_row = row[r];
    final int mapped_col = col[c];

    return matrix.valueAt (mapped_row, mapped_col);
}
/**
* Simulates the deletion of the odd columns by rearranging the <CODE>col</CODE> array
* of indexes. Nothing is really removed: every even position <CODE>c</CODE> (starting
* at 2) is swapped with position <CODE>c / 2</CODE>, which packs the even columns at
* the front of the array and pushes the odd ones towards the end, and
* <CODE>numcols</CODE> is then shrunk to the number of even columns. The
* <CODE>restoreOddColumns</CODE> method undoes this by applying the same swaps in
* reverse order.
*
* @see #restoreOddColumns
*/
protected void deleteOddColumns ()
{
    // 'target' always equals even / 2
    for (int target = 1, even = 2; even < numcols; target++, even += 2)
    {
        int kept = col[even];
        col[even] = col[target];
        col[target] = kept;
    }

    numcols = ((numcols - 1) / 2 + 1);
}
/**
* Puts the <CODE>col</CODE> array of indexes back into the state it had before
* <CODE>deleteOddColumns</CODE> was called. The only information required is the
* number of columns that existed at that time: the swaps performed by
* <CODE>deleteOddColumns</CODE> are replayed from the last one to the first one.
*
* @param original_numcols the number of columns before the odd ones were deleted
* @see #deleteOddColumns
*/
protected void restoreOddColumns (int original_numcols)
{
    // 'packed' is the position an even column was moved to; its home is 2 * packed
    for (int packed = (original_numcols - 1) / 2; packed > 0; packed--)
    {
        int home = 2 * packed;
        int moved = col[home];
        col[home] = col[packed];
        col[packed] = moved;
    }

    numcols = original_numcols;
}
/**
* This method is the key component of the SMAWK algorithm. It reduces an n x m matrix
* (n rows and m columns), where n >= m, to an m x m matrix by deleting n - m rows
* that are guaranteed to have no maximum value for any column. The result is a
* square submatrix that still contains, for each column c, the row that has the
* maximum value of c in the original matrix. The rows are deleted with the
* <CODE>deleteRow</CODE> method.
*
* <P>It uses the total monotonicity property of the matrix to identify which rows can
* safely be deleted.</P>
*
* <P>On return <CODE>numrows</CODE> holds the reduced number of rows.</P>
*
* @see #deleteRow
*/
protected void reduce ()
{
    int k = 0, reduced_numrows = numrows;

    // keep deleting while there are more rows than columns
    while (reduced_numrows > numcols)
    {
        if (valueAt(k, k) < valueAt(k + 1, k))
        {
            // row k is beaten by row k+1 in column k: delete row k
            deleteRow (reduced_numrows, k);
            reduced_numrows --;

            // step back to re-examine the previous row, but never move before the
            // first row: an unguarded decrement would make the next iteration read
            // row[-1] when row 0 has just been deleted
            if (k > 0)
                k --;
        }
        else if (k < numcols - 1)
        {
            // row k survives this comparison: advance along the diagonal
            k++;
        }
        else
        {
            // last column reached and row k still wins: delete row k+1
            deleteRow (reduced_numrows, k+1);
            reduced_numrows --;
        }
    }

    numrows = reduced_numrows;
}
/**
* This method simulates a deletion of a row in the matrix during the
* <CODE>reduce</CODE> operation. It just moves the index to the end of the array in a
* way that it can be restored afterwards by the <CODE>restoreRows</CODE> method
* (nothing is actually deleted). Deleted indexes are kept in ascending order.
*
* @param reduced_rows the current number of rows in the reducing matrix
* @param k the index of the row to be deleted
* @see #restoreRows
*/
protected void deleteRow (int reduced_rows, int k)
{
// remember the index stored at position k; it is re-inserted in the tail below
int r, saved_row = row[k];
// close the gap: shift the remaining active entries one position to the left
for (r = k + 1; r < reduced_rows; r++)
row[r - 1] = row[r];
// starting at the slot just freed, shift left every tail entry (up to position
// numrows - 1) that is smaller than saved_row, so the tail stays in ascending order
for (r = reduced_rows - 1; r < (numrows - 1) && row[r+1] < saved_row; r++)
row[r] = row[r+1];
// r now designates the slot where saved_row belongs
row[r] = saved_row;
}
/**
* Restores the <CODE>row</CODE> array of indexes to the state it was before the
* <CODE>reduce</CODE> method was called. It only needs to know how many rows there
* were originally. The indexes that were moved to the end of the array are restored to
* their original position.
*
* @param original_numrows the number of rows before the reduction was performed
* @see #deleteRow
* @see #reduce
*/
protected void restoreRows (int original_numrows)
{
// d is the position of the next entry of the tail (deleted indexes) to re-insert;
// it starts right after the numrows entries currently in use
// NOTE(review): row[d] is read on the first iteration, so this appears to assume
// numrows < original_numrows on entry - confirm that callers guarantee it
int r, r2, s, d = numrows;
for (r = 0; r < d; r++)
{
// the entry at d belongs before position r
if (row[r] > row[d])
{
s = row[d];
// shift entries r .. d-1 one position to the right to make room at r
for (r2 = d; r2 > r; r2--)
row[r2] = row[r2-1];
row[r] = s;
// move on to the next tail entry; the loop bound grows with d
d++;
// stop once every position below original_numrows has been consumed
if (d > original_numrows - 1) break;
}
}
numrows = original_numrows;
}
/**
* This is a simpler method for calculating column maxima. It does the same job as
* computeColumnMaxima
, but without complexity of the SMAWK algorithm.
*
*
The matrix is required to be an object that implements the Matrix
* interface. It is also expected to be monotone. If it is not, the result is
* unpredictable but, unlike computeColumnMaxima
, it cannot lead to an
* ArrayIndexOutOfBoundsException.
This method does not return the maximum values itself, but just the indexes of * their rows. Note that the array of column maxima (the computation result) must be * created beforehand and its size must be equal to the number of columns of the * matrix.
* *It takes advantage of the monotone property of the matrix only (SMAWK explores * the stronger constraint of total monotonicity), and therefore has a worst case time * complexity of O(n * m), where n is the number of rows and m is the number of * columns. However, this method tends to be faster for small matrices because it * avoids recursions and row and column manipulations.
* * @param matrix the matrix that will have its column maxima computed * @param col_maxima the array of column maxima (indexes of the rows containing * maximum values of each column); this is the computation result * @see #naiveComputeRowMaxima */ public static void naiveComputeColumnMaxima (Matrix matrix, int col_maxima[]) { int max_row = 0; //int last_max = 0; for (int c = 0; c < matrix.numColumns(); c ++) { for (int r = max_row; r < matrix.numRows(); r++) if (matrix.valueAt(r,c) > matrix.valueAt(max_row,c)) max_row = r; col_maxima[c] = max_row; // uncomment the following code to raise an exception when // the matrix is not monotone /* if (max_row < last_max) throw new IllegalArgumentException ("Non totally monotone matrix."); last_max = max_row; max_row = 0; */ } } /** * This is a simpler method for calculating row maxima. It does not use the SMAWK * algorithm. * *The matrix is required to be an object that implements the Matrix
* interface. It is also expected to be monotone. If it is not, the result is
* unpredictable but, unlike computeColumnMaxima
, it cannot lead to an
* ArrayIndexOutOfBoundsException.
*
* <P>This method does not return the maximum values themselves, but just the indexes of
* their columns. Note that the array of row maxima (the computation result) must be
* created beforehand and its size must be equal to the number of rows of the
* matrix.</P>
* *It takes advantage of the monotone property of the matrix only (SMAWK explores * the stronger constraint of total monotonicity), and therefore has a worst case time * complexity of O(n * m), where n is the number of rows and m is the number of * columns. However, this method tends to be faster for small matrices because it * avoids recursions and row and column manipulations.
* * @param matrix the matrix that will have its row maxima computed * @param row_maxima the array of row maxima (indexes of the columns containing * maximum values of each row); this is the computation result * @see #naiveComputeColumnMaxima */ public static void naiveComputeRowMaxima (Matrix matrix, int row_maxima[]) { int max_col = 0; //int last_max = 0; for (int r = 0; r < matrix.numRows(); r++) { for (int c = max_col; c < matrix.numColumns(); c ++) if (matrix.valueAt(r,c) > matrix.valueAt(r,max_col)) max_col = c; row_maxima[r] = max_col; // uncomment the following code to raise an exception when // the matrix is not monotone /* if (max_col < last_max) throw new IllegalArgumentException ("Non-monotone matrix."); last_max = max_col; max_col = 0; */ } } /** * Prints the current state of the matrix (reflecting deleted rows and columns) in the * standard output. It can be used internally for debugging. */ protected void printMatrix () { int r, c; System.out.print("row\\col\t| "); for (c = 0; c < numcols; c++) System.out.print(col[c] + "\t"); for (r = 0; r < numrows; r++) { System.out.print(row[r] + "\n\t| "); for (c = 0; c < numcols; c++) System.out.print(matrix.valueAt(r,c) + "\t"); } } /** * Prints the contents of an object implementing the matrix interface in the standard * output. It can be used for debugging. * * @param matrix a matrix */ public static void printMatrix (Matrix matrix) { for (int r = 0; r < matrix.numRows(); r++) { for (int c = 0; c < matrix.numColumns(); c++) System.out.print(matrix.valueAt(r,c) + "\t"); System.out.print("\n"); } } } neobio-0.0.20030929/src/neobio/alignment/BasicScoringScheme.java 0000644 0002656 0002032 00000015461 07717074372 023262 0 ustar tillea admin /* * BasicScoringScheme.java * * Copyright 2003 Sergio Anibal de Carvalho Junior * * This file is part of NeoBio. 
* * NeoBio is free software; you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with NeoBio; * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307, USA. * * Proper attribution of the author as the source of the software would be appreciated. * * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net * Department of Computer Science http://www.dcs.kcl.ac.uk * King's College London, UK http://www.kcl.ac.uk * * Please visit http://neobio.sourceforge.net * * This project was supervised by Professor Maxime Crochemore. * */ package neobio.alignment; /** * This class implements a basic scoring scheme. At least three parameters must be * provided to the constructor: the reward for a match (a substitution of equal * characters), the penalty for a mismatch (a substitution of different characters) and * the cost of a gap (an insertion or deletion of a character). Note that it only supports * an additive gap cost function. * *Although the match reward is expected to be a positive value, and the mismatch * penalty and the gap cost are expected to be negative, no attempt is made to enforce * these behaviour.
*
* @author Sergio A. de Carvalho Jr.
*/
public class BasicScoringScheme extends ScoringScheme
{
/**
* The reward for a match (a substitution of equal characters).
*/
protected int match_reward;
/**
* The penalty for a mismatch (a substitution of different characters).
*/
protected int mismatch_penalty;
/**
* The cost of a gap (an insertion or deletion of a character).
*/
protected int gap_cost;
/**
* The maximum absolute score that this scoring scheme can return, which is the
* maximum absolute value among <CODE>match_reward</CODE>,
* <CODE>mismatch_penalty</CODE> and <CODE>gap_cost</CODE>.
*/
protected int max_absolute_score;
/**
* Creates a new instance of a basic scoring scheme with the specified values of
* match reward, mismatch penalty and gap cost. The case of characters is significant
* when subsequently computing their score.
*
* @param match_reward reward for a substitution of equal characters
* @param mismatch_penalty penalty for a substitution of different characters
* @param gap_cost cost of an insertion or deletion of any character
*/
public BasicScoringScheme (int match_reward, int mismatch_penalty, int gap_cost)
{
// delegate to the four-argument constructor with case sensitivity switched on
this (match_reward, mismatch_penalty, gap_cost, true);
}
/**
* Creates a new instance of basic scoring scheme with the specified values of
* match reward, mismatch penalty and gap cost. If case_sensitive
is
* true
, the case of characters is significant when subsequently
* computing their score; otherwise the case is ignored.
*
* @param match_reward reward for a substitution of equal characters
* @param mismatch_penalty penalty for a substitution of different characters
* @param gap_cost cost of an insertion or deletion of any character
* @param case_sensitive true
if the case of characters must be
* significant, false
otherwise
*/
public BasicScoringScheme (int match_reward, int mismatch_penalty, int gap_cost,
boolean case_sensitive)
{
super(case_sensitive);
this.match_reward = match_reward;
this.mismatch_penalty = mismatch_penalty;
this.gap_cost = gap_cost;
// store the maximum absolute score that this scoring scheme can return,
// which is the maximum absolute value among match_reward, mismatch_penalty
// and gap_cost
if (Math.abs(match_reward) >= Math.abs(mismatch_penalty))
if (Math.abs(match_reward) >= Math.abs(gap_cost))
this.max_absolute_score = Math.abs(match_reward);
else
this.max_absolute_score = Math.abs(gap_cost);
else
if (Math.abs(mismatch_penalty) >= Math.abs(gap_cost))
this.max_absolute_score = Math.abs(mismatch_penalty);
else
this.max_absolute_score = Math.abs(gap_cost);
}
/**
* Returns the score of a substitution of character <CODE>a</CODE> for character
* <CODE>b</CODE> according to this scoring scheme: <CODE>match_reward</CODE> when the
* two characters are considered equal, <CODE>mismatch_penalty</CODE> otherwise. If
* this scheme is not case sensitive, the characters are compared after being
* converted to lower case.
*
* @param a first character
* @param b second character
* @return <CODE>match_reward</CODE> if <CODE>a</CODE> equals <CODE>b</CODE>,
* <CODE>mismatch_penalty</CODE> otherwise.
*/
public int scoreSubstitution (char a, char b)
{
    final boolean same;

    if (isCaseSensitive())
        same = (a == b);
    else
        same = (Character.toLowerCase(a) == Character.toLowerCase(b));

    return same ? match_reward : mismatch_penalty;
}
/**
* Always returns <CODE>gap_cost</CODE> for the insertion of any character.
*
* @param a the character to be inserted (not used by this scoring scheme)
* @return <CODE>gap_cost</CODE>
*/
public int scoreInsertion (char a)
{
// the gap cost does not depend on the character
return gap_cost;
}
/**
* Always returns <CODE>gap_cost</CODE> for the deletion of any character.
*
* @param a the character to be deleted (not used by this scoring scheme)
* @return <CODE>gap_cost</CODE>
*/
public int scoreDeletion (char a)
{
// the gap cost does not depend on the character
return gap_cost;
}
/**
* Returns the maximum absolute score that this scoring scheme can return for any
* substitution, deletion or insertion, which is the maximum absolute value among
* <CODE>match_reward</CODE>, <CODE>mismatch_penalty</CODE> and
* <CODE>gap_cost</CODE>.
*
* @return the maximum absolute value among <CODE>match_reward</CODE>,
* <CODE>mismatch_penalty</CODE> and <CODE>gap_cost</CODE>.
*/
public int maxAbsoluteScore ()
{
// value precomputed by the constructor
return max_absolute_score;
}
/**
* Tells whether this scoring scheme supports partial matches, which it does not.
*
* @return always <CODE>false</CODE>
*/
public boolean isPartialMatchSupported ()
{
return false;
}
/**
* Builds a textual description of this scoring scheme that lists its match reward,
* mismatch penalty and gap cost.
*
* @return a String representation of this scoring scheme
*/
public String toString ()
{
    StringBuffer description = new StringBuffer ("Basic scoring scheme: match reward = ");

    description.append (match_reward);
    description.append (", mismatch penalty = ").append (mismatch_penalty);
    description.append (", gap cost = ").append (gap_cost);

    return description.toString();
}
}
neobio-0.0.20030929/src/neobio/alignment/LocalAlignmentBlock.java 0000644 0002656 0002032 00000007351 07723462324 023425 0 ustar tillea admin /*
* LocalAlignmentBlock.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
* This class is used by the {@linkplain CrochemoreLandauZivUkelsonLocalAlignment}
* algorithm to store the information of an alignment block. All fields are public (but
* final) in order to simplify the access to the data.
*
* For more information on how this class is used, please refer to the specification
* of the CrochemoreLandauZivUkelsonLocalAlignment
class.
row
column col
*/
// NOTE(review): the declaring type's header is not visible here; these look like the
// methods of the Matrix interface referenced by the Smawk class - confirm.
// Retrieves the int value stored at the given row and column.
public int valueAt (int row, int col);
/**
* Returns the number of rows that this matrix has.
*
* @return number of rows
*/
public int numRows ();
/**
* Returns the number of columns that this matrix has.
*
* @return number of columns
*/
public int numColumns ();
}
neobio-0.0.20030929/src/neobio/alignment/CrochemoreLandauZivUkelsonGlobalAlignment.java 0000644 0002656 0002032 00000030174 07725336372 030012 0 ustar tillea admin /*
* CrochemoreLandauZivUkelsonGlobalAlignment.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
* This class implements the global pairwise sequence alignment algorithm (with
* linear gap penalty function) due to Maxime Crochemore, Gad Landau and Michal
* Ziv-Ukelson (2002).
*
* This implementation derives from the paper of M.Crochemore, G.Landau and * M.Ziv-Ukelson, A Sub-quadratic Sequence Alignment Algorithm for Unrestricted Scoring * Matrices (available here as * PDF or * Postscript).
* *For a general description of the algorithm, please refer to the specification of the * abstract {@linkplain CrochemoreLandauZivUkelson} superclass.
* *This class consist mainly of methods that:
* *computeOutputBorder
method to compute the block's output border.
*
* @param factor1 factor of the first sequence
* @param factor2 factor of the second sequence
* @param row row index of the block in the block table
* @param col column index of the block in the block table
* @return the computed block
* @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
* with the sequences being aligned
*/
protected AlignmentBlock createBlock (Factor factor1, Factor factor2, int row,
    int col) throws IncompatibleScoringSchemeException
{
    final int num_rows = factor1.length();
    final int num_cols = factor2.length();
    final int border = num_rows + num_cols + 1;
    final AlignmentBlock result = new AlignmentBlock (factor1, factor2, border);

    // blocks this one extends from
    final AlignmentBlock left = getLeftPrefix (result);
    final AlignmentBlock diagonal = getDiagonalPrefix (result);
    final AlignmentBlock top = getTopPrefix (result);

    // scores of the three possible last operations
    final int insertion = scoreInsertion (factor2.getNewChar());
    final int substitution =
        scoreSubstitution (factor1.getNewChar(), factor2.getNewChar());
    final int deletion = scoreDeletion (factor1.getNewChar());

    // fill in the dist column and the direction of the best path
    // to every position of the input border
    final int last = border - 1;
    for (int pos = 0; pos < border; pos++)
    {
        // a candidate that is not available at this position keeps
        // the smallest possible value
        int via_left = (pos < last)
            ? left.dist_column[pos] + insertion : Integer.MIN_VALUE;
        int via_diagonal = (pos > 0 && pos < last)
            ? diagonal.dist_column[pos - 1] + substitution : Integer.MIN_VALUE;
        int via_top = (pos > 0)
            ? top.dist_column[pos - 1] + deletion : Integer.MIN_VALUE;

        int best = max (via_left, via_diagonal, via_top);
        result.dist_column[pos] = best;

        // ties are resolved in the order left, diagonal, top
        if (best == via_left)
            result.direction[pos] = LEFT_DIRECTION;
        else if (best == via_diagonal)
            result.direction[pos] = DIAGONAL_DIRECTION;
        else
            result.direction[pos] = TOP_DIRECTION;
    }

    computeOutputBorder (result, row, col, border, num_cols, num_rows);

    return result;
}
/**
* Builds the root block, a special case of the <CODE>createBlock</CODE> method in
* which no information is actually computed: the block is simply constructed from
* the two factors.
*
* @param factor1 factor of the first sequence
* @param factor2 factor of the second sequence
* @return the root block
*/
protected AlignmentBlock createRootBlock (Factor factor1, Factor factor2)
{
    final AlignmentBlock root = new AlignmentBlock (factor1, factor2);

    return root;
}
/**
* Creates and computes all information of an alignment block that belongs to the
* first row of the block table. This is a special case of the
* <CODE>createBlock</CODE> method: every position of the dist column except the last
* one is reached from the left prefix by an insertion, and the last position is
* marked with the stop direction.
*
* @param factor1 factor of the first sequence
* @param factor2 factor of the second sequence
* @param col column index of the block in the block table
* @return the computed block
* @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
* with the sequences being aligned
* @see #createBlock createBlock
*/
protected AlignmentBlock createFirstRowBlock (Factor factor1, Factor factor2, int col)
    throws IncompatibleScoringSchemeException
{
    // the length of factor1 is deliberately not used for first-row blocks
    final int num_rows = 0;
    final int num_cols = factor2.length();
    final int border = num_rows + num_cols + 1;
    final AlignmentBlock result = new AlignmentBlock (factor1, factor2, border);

    final AlignmentBlock left = getLeftPrefix (result);
    final int insertion = scoreInsertion (factor2.getNewChar());

    // every position but the last one comes from the left prefix
    int pos = 0;
    while (pos < num_cols)
    {
        result.dist_column[pos] = left.dist_column[pos] + insertion;
        result.direction[pos] = LEFT_DIRECTION;
        pos++;
    }

    // the last position is where paths stop
    result.dist_column[num_cols] = 0;
    result.direction[num_cols] = STOP_DIRECTION;

    computeOutputBorder (result, 0, col, border, num_cols, num_rows);

    return result;
}
/**
* Creates and computes all information of an alignment block that belongs to the
* first column of the block table. This is a special case of the
* <CODE>createBlock</CODE> method: the first position of the dist column is marked
* with the stop direction, and every other position is reached from the top prefix
* by a deletion.
*
* @param factor1 factor of the first sequence
* @param factor2 factor of the second sequence
* @param row row index of the block in the block table
* @return the computed block
* @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
* with the sequences being aligned
* @see #createBlock createBlock
*/
protected AlignmentBlock createFirstColumnBlock (Factor factor1, Factor factor2,
    int row) throws IncompatibleScoringSchemeException
{
    final int num_rows = factor1.length();
    // the length of factor2 is deliberately not used for first-column blocks
    final int num_cols = 0;
    final int border = num_rows + num_cols + 1;
    final AlignmentBlock result = new AlignmentBlock (factor1, factor2, border);

    final AlignmentBlock top = getTopPrefix (result);
    final int deletion = scoreDeletion (factor1.getNewChar());

    // the first position is where paths stop
    result.dist_column[0] = 0;
    result.direction[0] = STOP_DIRECTION;

    // every other position comes from the top prefix
    int pos = 1;
    while (pos < border)
    {
        result.dist_column[pos] = top.dist_column[pos - 1] + deletion;
        result.direction[pos] = TOP_DIRECTION;
        pos++;
    }

    computeOutputBorder (result, row, 0, border, num_cols, num_rows);

    return result;
}
/**
* Computes the output border of a block. This is performed in five steps:
*
* case_sensitive
is
* true
, the case of characters is significant when subsequently
* computing their score; otherwise the case is ignored.
*
* @param case_sensitive true
if the case of characters must be
* significant, false
otherwise
*/
public ScoringScheme (boolean case_sensitive)
{
// remember the flag; it is reported back by isCaseSensitive()
this.case_sensitive = case_sensitive;
}
/**
* Tells whether this scoring scheme treats the case of characters as significant
* when computing their score.
*
* @return <CODE>true</CODE> if the case of characters is significant,
* <CODE>false</CODE> otherwise
*/
public boolean isCaseSensitive ()
{
return this.case_sensitive;
}
/**
* Returns the score of a substitution of character <CODE>a</CODE> for character
* <CODE>b</CODE> according to this scoring scheme. If this substitution is not
* defined, an exception is raised.
*
* @param a first character
* @param b second character
* @return score of substitution of <CODE>a</CODE> for <CODE>b</CODE>
* @throws IncompatibleScoringSchemeException if this substitution is not defined
*/
public abstract int scoreSubstitution (char a, char b)
throws IncompatibleScoringSchemeException;
/**
* Returns the score of an insertion of character <CODE>a</CODE> according to this
* scoring scheme. If this character is not recognised, an exception is raised.
*
* @param a the character to be inserted
* @return score of insertion of <CODE>a</CODE>
* @throws IncompatibleScoringSchemeException if character is not recognised by this
* scoring scheme
*/
public abstract int scoreInsertion (char a)
throws IncompatibleScoringSchemeException;
/**
* Returns the score of a deletion of character <CODE>a</CODE> according to this
* scoring scheme. If this character is not recognised, an exception is raised.
*
* @param a the character to be deleted
* @return score of deletion of <CODE>a</CODE>
* @throws IncompatibleScoringSchemeException if character is not recognised by this
* scoring scheme
*/
public abstract int scoreDeletion (char a)
throws IncompatibleScoringSchemeException;
/**
* Returns the maximum absolute score that this scoring scheme can return for any
* substitution, deletion or insertion.
*
* @return maximum absolute score that can be returned
*/
public abstract int maxAbsoluteScore ();
/**
* Returns <CODE>true</CODE> if this scoring scheme supports partial matches,
* <CODE>false</CODE> otherwise. A partial match is a situation when two characters
* are not equal but, for any reason, are regarded as similar by this scoring scheme,
* which then returns a positive score. This is common for scoring schemes
* that implement amino acid scoring matrices.
*
* @return <CODE>true</CODE> if this scoring scheme supports partial matches,
* <CODE>false</CODE> otherwise
*/
public abstract boolean isPartialMatchSupported ();
}
neobio-0.0.20030929/src/neobio/alignment/NeedlemanWunsch.java 0000644 0002656 0002032 00000027513 07725254026 022643 0 ustar tillea admin /*
* NeedlemanWunsch.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.IOException;
/**
 * This class implements the classic global alignment algorithm (with linear gap penalty
 * function) due to S.B.Needleman and C.D.Wunsch (1970).
 *
 * <P>It is based on a dynamic programming approach. The idea consists of, given two
 * sequences A and B of sizes n and m, respectively, building an (n+1 x m+1) matrix M that
 * contains the similarity of prefixes of A and B. Every position M[i,j] in the matrix
 * holds the score between the subsequences A[1..i] and B[1..j]. The first row and column
 * represent alignments with spaces.</P>
 *
 * <P>Starting from row 0, column 0, the algorithm computes each position M[i,j] with the
 * following recurrence:</P>
 *
 * <CODE><BLOCKQUOTE><PRE>
 * M[0,0] = 0
 * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
 *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
 *                M[i-1,j]   + scoreDeletion(A[i])             }
 * </PRE></BLOCKQUOTE></CODE>
 *
 * <P>In the end, the value at the last position (last row, last column) will contain
 * the similarity between the two sequences. This part of the algorithm is accomplished
 * by the {@link #computeMatrix computeMatrix} method. It has quadratic space complexity
 * since it needs to keep an (n+1 x m+1) matrix in memory. And since the work of computing
 * each cell is constant, it also has quadratic time complexity.</P>
 *
 * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path
 * back in the matrix from the last position to the first. This step is performed by
 * the {@link #buildOptimalAlignment buildOptimalAlignment} method, and since the path can
 * be roughly as long as (m + n), this method has O(n) time complexity.</P>
 *
 * <P>If the similarity value only is needed (and not the alignment itself), it is easy to
 * reduce the space requirement to O(n) by keeping just the last row or column in memory.
 * This is precisely what is done by the {@link #computeScore computeScore} method. Note
 * that it still requires O(n<SUP>2</SUP>) time.</P>
 *
 * <P>For a more efficient approach to the global alignment problem, see the
 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For local alignment, see the
 * {@linkplain SmithWaterman} algorithm.</P>
 *
 * @author Sergio A. de Carvalho Jr.
 * @see SmithWaterman
 * @see CrochemoreLandauZivUkelson
 * @see CrochemoreLandauZivUkelsonLocalAlignment
 * @see CrochemoreLandauZivUkelsonGlobalAlignment
 */
public class NeedlemanWunsch extends PairwiseAlignmentAlgorithm
{
/**
 * The first sequence of an alignment.
 */
protected CharSequence seq1;
/**
 * The second sequence of an alignment.
 */
protected CharSequence seq2;
/**
 * The dynamic programming matrix. Each position (i, j) represents the best score
 * between the first i characters of <CODE>seq1</CODE> and the first j characters of
 * <CODE>seq2</CODE>.
 */
protected int[][] matrix;
/**
 * Wraps each input in a {@linkplain CharSequence} instance and stores the results as
 * the first and second sequences of this algorithm. Any failure is reported by the
 * constructor of <CODE>CharSequence</CODE> (please check the specification of that
 * class for specific requirements).
 *
 * @param input1 Input for first sequence
 * @param input2 Input for second sequence
 * @throws IOException If an I/O error occurs when reading the sequences
 * @throws InvalidSequenceException If the sequences are not valid
 * @see CharSequence
 */
protected void loadSequencesInternal (Reader input1, Reader input2)
    throws IOException, InvalidSequenceException
{
    // the first sequence is built (and stored) before the second one is attempted
    seq1 = new CharSequence (input1);
    seq2 = new CharSequence (input2);
}
/**
 * Drops the references to both loaded sequences and to the dynamic programming
 * matrix, making their data eligible for garbage collection.
 */
protected void unloadSequencesInternal ()
{
    matrix = null;
    seq2 = null;
    seq1 = null;
}
/**
 * Builds an optimal global alignment between the loaded sequences after computing the
 * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method
 * after the <CODE>computeMatrix</CODE> method computes the dynamic programming
 * matrix.
 *
 * <P>The reference to the matrix is cleared before this method returns, whether it
 * completes normally or an exception is thrown, so the (quadratic) matrix never
 * outlives the computation.</P>
 *
 * @return an optimal global alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 * @see #computeMatrix
 * @see #buildOptimalAlignment
 */
protected PairwiseAlignment computePairwiseAlignment ()
    throws IncompatibleScoringSchemeException
{
    try
    {
        // fill in the matrix, then trace an optimal global alignment back through it
        computeMatrix ();
        return buildOptimalAlignment ();
    }
    finally
    {
        // allow the matrix to be garbage collected, even if scoring failed half-way
        matrix = null;
    }
}
/**
 * Allocates and fills in the dynamic programming matrix for the loaded sequences.
 * The matrix has one more row than the length of <CODE>seq1</CODE> and one more
 * column than the length of <CODE>seq2</CODE>; the first row accumulates insertion
 * scores, the first column accumulates deletion scores, and every other cell takes
 * the greatest of the insertion, substitution and deletion alternatives.
 *
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected void computeMatrix () throws IncompatibleScoringSchemeException
{
    final int rows = seq1.length() + 1;
    final int cols = seq2.length() + 1;

    matrix = new int [rows][cols];

    // first row: nothing but insertions
    matrix[0][0] = 0;
    for (int col = 1; col < cols; col++)
        matrix[0][col] = matrix[0][col-1] + scoreInsertion(seq2.charAt(col));

    // remaining rows, top to bottom
    for (int row = 1; row < rows; row++)
    {
        // first column: nothing but deletions
        matrix[row][0] = matrix[row-1][0] + scoreDeletion(seq1.charAt(row));

        for (int col = 1; col < cols; col++)
        {
            int viaInsertion = matrix[row][col-1] + scoreInsertion(seq2.charAt(col));
            int viaSubstitution = matrix[row-1][col-1]
                + scoreSubstitution(seq1.charAt(row), seq2.charAt(col));
            int viaDeletion = matrix[row-1][col] + scoreDeletion(seq1.charAt(row));

            // keep the best of the three alternatives
            matrix[row][col] = max (viaInsertion, viaSubstitution, viaDeletion);
        }
    }
}
/**
 * Traces an optimal global alignment back through the dynamic programming matrix,
 * from the last row and column to the origin. Before it is executed, the matrix must
 * already have been computed by the <CODE>computeMatrix</CODE> method.
 *
 * <P>At every cell the method tests, in this order, whether the value was reached by
 * an insertion, then by a substitution, and otherwise assumes a deletion.</P>
 *
 * @return an optimal global alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme
 * is not compatible with the loaded sequences.
 * @see #computeMatrix
 */
protected PairwiseAlignment buildOptimalAlignment ()
    throws IncompatibleScoringSchemeException
{
    StringBuffer top = new StringBuffer();
    StringBuffer tags = new StringBuffer();
    StringBuffer bottom = new StringBuffer();

    // the traceback starts at the bottom-right corner, which holds the final score
    int row = matrix.length - 1;
    int col = matrix[row].length - 1;
    final int score = matrix[row][col];

    while (row > 0 || col > 0)
    {
        if (col > 0
            && matrix[row][col] == matrix[row][col-1] + scoreInsertion(seq2.charAt(col)))
        {
            // reached through an insertion: gap in the first sequence
            top.insert (0, GAP_CHARACTER);
            tags.insert (0, GAP_TAG);
            bottom.insert (0, seq2.charAt(col));
            col--;
            continue;
        }

        if (row > 0 && col > 0)
        {
            int sub = scoreSubstitution(seq1.charAt(row), seq2.charAt(col));

            if (matrix[row][col] == matrix[row-1][col-1] + sub)
            {
                // reached through a substitution: both characters are consumed
                top.insert (0, seq1.charAt(row));

                if (seq1.charAt(row) != seq2.charAt(col))
                {
                    // different characters: partial match or plain mismatch
                    if (sub > 0)
                    {
                        tags.insert (0, APPROXIMATE_MATCH_TAG);
                    }
                    else
                    {
                        tags.insert (0, MISMATCH_TAG);
                    }
                }
                else if (useMatchTag())
                {
                    tags.insert (0, MATCH_TAG);
                }
                else
                {
                    tags.insert (0, seq1.charAt(row));
                }

                bottom.insert (0, seq2.charAt(col));
                row--;
                col--;
                continue;
            }
        }

        // neither of the above: must be a deletion (gap in the second sequence)
        top.insert (0, seq1.charAt(row));
        tags.insert (0, GAP_TAG);
        bottom.insert (0, GAP_CHARACTER);
        row--;
    }

    return new PairwiseAlignment (top.toString(), tags.toString(),
                                    bottom.toString(), score);
}
/**
 * Computes the score of the best global alignment between the two sequences using the
 * scoring scheme previously set. Only the similarity value is calculated: a single
 * row or column of the matrix (whichever is shorter) is kept in memory, so the
 * alignment itself cannot be recovered, but only O(n) space is required.
 *
 * @return score of the best global alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected int computeScore () throws IncompatibleScoringSchemeException
{
    final int rows = seq1.length() + 1;
    final int cols = seq2.length() + 1;
    int[] line;
    int pending;

    if (rows > cols)
    {
        // fewer columns than rows: sweep row by row, keeping one row in memory
        line = new int [cols];

        // first row
        line[0] = 0;
        for (int col = 1; col < cols; col++)
            line[col] = line[col-1] + scoreInsertion(seq2.charAt(col));

        for (int row = 1; row < rows; row++)
        {
            // first column of this row; 'pending' always holds the value that
            // still has to be written one position behind the current one
            pending = line[0] + scoreDeletion(seq1.charAt(row));

            for (int col = 1; col < cols; col++)
            {
                int viaInsertion = pending + scoreInsertion(seq2.charAt(col));
                int viaSubstitution = line[col-1]
                    + scoreSubstitution(seq1.charAt(row), seq2.charAt(col));
                int viaDeletion = line[col] + scoreDeletion(seq1.charAt(row));

                // the previous-row value at col-1 is no longer needed
                line[col-1] = pending;

                pending = max (viaInsertion, viaSubstitution, viaDeletion);
            }

            // store the last value of the row
            line[cols - 1] = pending;
        }

        return line[cols - 1];
    }

    // otherwise sweep column by column, keeping one column in memory
    line = new int [rows];

    // first column
    line[0] = 0;
    for (int row = 1; row < rows; row++)
        line[row] = line[row-1] + scoreDeletion(seq1.charAt(row));

    for (int col = 1; col < cols; col++)
    {
        // first row of this column
        pending = line[0] + scoreInsertion(seq2.charAt(col));

        for (int row = 1; row < rows; row++)
        {
            int viaInsertion = line[row] + scoreInsertion(seq2.charAt(col));
            int viaSubstitution = line[row-1]
                + scoreSubstitution(seq1.charAt(row), seq2.charAt(col));
            int viaDeletion = pending + scoreDeletion(seq1.charAt(row));

            // the previous-column value at row-1 is no longer needed
            line[row-1] = pending;

            pending = max (viaInsertion, viaSubstitution, viaDeletion);
        }

        // store the last value of the column
        line[rows - 1] = pending;
    }

    return line[rows - 1];
}
}
neobio-0.0.20030929/src/neobio/alignment/Factor.java 0000644 0002656 0002032 00000014334 07717074374 021005 0 ustar tillea admin /*
* Factor.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
* This class is used by {@linkplain FactorSequence} to create a linked list of factors of
* a text as induced by its Lempel-Ziv (LZ78) factorisation.
*
* Each instance of this class represents a string composed of its ancestor factor's
* string plus one character, and contains:
*
* <CODE>Factor</CODE>. It has no ancestor and no character (both
* are set to <CODE>null</CODE>). Its serial number is set to zero as well as its
* length.
*
* This constructor is used to initiate a linked list of factors of a text. Its
* <CODE>next</CODE> pointer is initially <CODE>null</CODE>, but it is typically set
* to point to the first factor afterwards (with the <CODE>setNext</CODE> method).
*
* @see #setNext
*/
public Factor ()
{
    // no links yet: neither an ancestor nor a following factor
    ancestor = null;
    next = null;

    // serial number, length and character all start out as zero
    serial_number = 0;
    length = 0;
    new_char = 0;
}
/**
 * Creates a new <CODE>Factor</CODE> instance with the specified serial number and
 * new character, and pointing to the given ancestor. Its length is set to its
 * ancestor's length plus 1.
 *
 * <P>Its <CODE>next</CODE> pointer is initially <CODE>null</CODE>, but it is
 * typically set to point to the next factor afterwards (with the
 * <CODE>setNext</CODE> method).</P>
 *
 * @param ancestor this factor's ancestor
 * @param serial_number this factor's serial number
 * @param new_char this factor's new character
 * @throws IllegalArgumentException if <CODE>ancestor</CODE> is <CODE>null</CODE>
 * @see #setNext
 */
public Factor (Factor ancestor, int serial_number, char new_char)
{
    // a factor built with this constructor always extends an existing one
    if (ancestor == null)
        throw new IllegalArgumentException ("Ancestor factor cannot be null.");

    this.ancestor = ancestor;
    this.serial_number = serial_number;
    this.new_char = new_char;
    this.length = ancestor.length() + 1;
}
/**
 * Sets this factor's <CODE>next</CODE> pointer to point to the specified factor.
 * Although the next factor has typically a serial number equal to this factor's
 * serial number plus 1, no attempt is made to guarantee this rule. This allows
 * special constructs or a different order in the factorisation.
 *
 * @param next the factor that will be pointed to
 * @see #getNext
 */
public void setNext (Factor next)
{
this.next = next;
}
/**
 * Gives access to the factor this one was derived from.
 *
 * @return this factor's ancestor factor
 */
public Factor getAncestor ()
{
    return this.ancestor;
}
/**
 * Shorthand for asking this factor's ancestor for its serial number. No check is
 * made for the presence of an ancestor, therefore calling this method on the root
 * factor raises a NullPointerException.
 *
 * @return the serial number of this factor's ancestor
 */
public int getAncestorSerialNumber ()
{
    // deliberately unchecked: a missing ancestor surfaces as a NullPointerException
    return this.ancestor.getSerialNumber();
}
/**
 * Gives access to the factor that follows this one in the linked list.
 *
 * @return this factor's next factor
 * @see #setNext
 */
public Factor getNext ()
{
    return this.next;
}
/**
 * Gives access to the serial number assigned to this factor.
 *
 * @return this factor's serial number
 */
public int getSerialNumber ()
{
    return this.serial_number;
}
/**
 * Gives access to the length stored for this factor.
 *
 * @return this factor's length
 */
public int length ()
{
    return this.length;
}
/**
 * Gives access to the character this factor adds to its ancestor's string.
 *
 * @return this factor's new character
 */
public char getNewChar ()
{
    return this.new_char;
}
/**
 * Spells out the text represented by this factor. Starting at this factor, it walks
 * the chain of ancestors and prepends each factor's new character, stopping at the
 * first factor that has no ancestor (whose own character is not included).
 *
 * @return a string representation of the text denoted by this factor
 */
public String toString ()
{
    StringBuffer text = new StringBuffer();

    // every visited factor contributes its character in front of what was collected
    for (Factor node = this; node.getAncestor() != null; node = node.getAncestor())
        text.insert (0, node.getNewChar());

    return text.toString();
}
}
neobio-0.0.20030929/src/neobio/alignment/SmithWaterman.java 0000644 0002656 0002032 00000030631 07717234470 022343 0 ustar tillea admin /*
* SmithWaterman.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.IOException;
/**
 * This class implements the classic local alignment algorithm (with linear gap penalty
 * function) due to T.F.Smith and M.S.Waterman (1981).
 *
 * <P>This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for
 * global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given
 * two sequences A and B of sizes n and m, respectively. However, unlike in the global
 * alignment case, every position M[i,j] in the matrix contains the similarity score of
 * suffixes of A[1..i] and B[1..j].</P>
 *
 * <P>Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method
 * computes each position M[i,j] with the following recurrence:</P>
 *
 * <CODE><BLOCKQUOTE><PRE>
 * M[0,0] = M[0,j] = M[i,0] = 0
 * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
 *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
 *                M[i-1,j]   + scoreDeletion(A[i]),
 *                0                                            }
 * </PRE></BLOCKQUOTE></CODE>
 *
 * <P>Note that, here, all cells in the first row and column are set to zero, and that
 * no cell is allowed to drop below zero. The best local alignment score is the highest
 * value found anywhere in the matrix.</P>
 *
 * <P>Just like in global alignment case, this algorithm has quadratic space complexity
 * because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of
 * computing each cell is constant, it also has quadratic time complexity.</P>
 *
 * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path
 * back in the matrix from the position of the highest score until a cell of value zero is
 * reached. This step is performed by the {@link #buildOptimalAlignment
 * buildOptimalAlignment} method, and its time complexity is linear on the size of the
 * alignment.</P>
 *
 * <P>If the similarity value only is needed (and not the alignment itself), it is easy to
 * reduce the space requirement to O(n) by keeping just the last row or column in memory.
 * This is precisely what is done by the {@link #computeScore computeScore} method. Note
 * that it still requires O(n<SUP>2</SUP>) time.</P>
 *
 * <P>For a more efficient approach to the local alignment problem, see the
 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the
 * {@linkplain NeedlemanWunsch} algorithm.</P>
 *
 * @author Sergio A. de Carvalho Jr.
 * @see NeedlemanWunsch
 * @see CrochemoreLandauZivUkelson
 * @see CrochemoreLandauZivUkelsonLocalAlignment
 * @see CrochemoreLandauZivUkelsonGlobalAlignment
 */
public class SmithWaterman extends PairwiseAlignmentAlgorithm
{
/**
 * The first sequence of an alignment.
 */
protected CharSequence seq1;
/**
 * The second sequence of an alignment.
 */
protected CharSequence seq2;
/**
 * The dynamic programming matrix. Each position (i, j) represents the best score
 * between a suffix of the first i characters of <CODE>seq1</CODE> and a suffix of
 * the first j characters of <CODE>seq2</CODE>.
 */
protected int[][] matrix;
/**
 * Indicates the row where an optimal local alignment can be found in the matrix.
 */
protected int max_row;
/**
 * Indicates the column where an optimal local alignment can be found in the matrix.
 */
protected int max_col;
/**
 * Wraps each input in a {@linkplain CharSequence} instance and stores the results as
 * the first and second sequences of this algorithm. Any failure is reported by the
 * constructor of <CODE>CharSequence</CODE> (please check the specification of that
 * class for specific requirements).
 *
 * @param input1 Input for first sequence
 * @param input2 Input for second sequence
 * @throws IOException If an I/O error occurs when reading the sequences
 * @throws InvalidSequenceException If the sequences are not valid
 * @see CharSequence
 */
protected void loadSequencesInternal (Reader input1, Reader input2)
    throws IOException, InvalidSequenceException
{
    // the first sequence is built (and stored) before the second one is attempted
    seq1 = new CharSequence (input1);
    seq2 = new CharSequence (input2);
}
/**
 * Drops the references to both loaded sequences and to the dynamic programming
 * matrix, making their data eligible for garbage collection.
 */
protected void unloadSequencesInternal ()
{
    matrix = null;
    seq2 = null;
    seq1 = null;
}
/**
 * Builds an optimal local alignment between the loaded sequences after computing the
 * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method
 * after the <CODE>computeMatrix</CODE> method computes the dynamic programming
 * matrix.
 *
 * <P>The reference to the matrix is cleared before this method returns, whether it
 * completes normally or an exception is thrown, so the (quadratic) matrix never
 * outlives the computation.</P>
 *
 * @return an optimal pairwise alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 * @see #computeMatrix
 * @see #buildOptimalAlignment
 */
protected PairwiseAlignment computePairwiseAlignment ()
    throws IncompatibleScoringSchemeException
{
    try
    {
        // fill in the matrix, then trace an optimal local alignment back through it
        computeMatrix ();
        return buildOptimalAlignment ();
    }
    finally
    {
        // allow the matrix to be garbage collected, even if scoring failed half-way
        matrix = null;
    }
}
/**
 * Allocates and fills in the dynamic programming matrix for the loaded sequences.
 * The first row and the first column are set to zero and every other cell takes the
 * greatest of the insertion, substitution and deletion alternatives, or zero if all
 * of them are negative. While the matrix is filled, the position of the first cell
 * (in row-wise order) holding the highest score is recorded in <CODE>max_row</CODE>
 * and <CODE>max_col</CODE>.
 *
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected void computeMatrix () throws IncompatibleScoringSchemeException
{
    final int rows = seq1.length() + 1;
    final int cols = seq2.length() + 1;

    matrix = new int [rows][cols];

    // first row is all zeros
    for (int col = 0; col < cols; col++)
        matrix[0][col] = 0;

    // nothing found so far: the best score is zero, located at the origin
    int best = 0;
    max_row = 0;
    max_col = 0;

    // remaining rows, top to bottom
    for (int row = 1; row < rows; row++)
    {
        // first column is all zeros
        matrix[row][0] = 0;

        for (int col = 1; col < cols; col++)
        {
            int viaInsertion = matrix[row][col-1] + scoreInsertion(seq2.charAt(col));
            int viaSubstitution = matrix[row-1][col-1]
                + scoreSubstitution(seq1.charAt(row), seq2.charAt(col));
            int viaDeletion = matrix[row-1][col] + scoreDeletion(seq1.charAt(row));

            // keep the best alternative, never dropping below zero
            matrix[row][col] = max (viaInsertion, viaSubstitution, viaDeletion, 0);

            // only a strictly higher score replaces the recorded maximum
            if (matrix[row][col] > best)
            {
                best = matrix[row][col];
                max_row = row;
                max_col = col;
            }
        }
    }
}
/**
 * Traces an optimal local alignment back through the dynamic programming matrix,
 * starting at the cell recorded in <CODE>max_row</CODE> and <CODE>max_col</CODE> and
 * stopping as soon as a cell that is not positive (or the origin) is reached. Before
 * it is executed, the matrix must already have been computed by the
 * <CODE>computeMatrix</CODE> method.
 *
 * <P>At every cell the method tests, in this order, whether the value was reached by
 * an insertion, then by a substitution, and otherwise assumes a deletion.</P>
 *
 * @return an optimal local alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 * @see #computeMatrix
 */
protected PairwiseAlignment buildOptimalAlignment () throws
    IncompatibleScoringSchemeException
{
    // the traceback starts at the cell holding the maximum score
    int row = max_row;
    int col = max_col;
    final int score = matrix[row][col];

    StringBuffer top = new StringBuffer();
    StringBuffer tags = new StringBuffer();
    StringBuffer bottom = new StringBuffer();

    while ((row > 0 || col > 0) && matrix[row][col] > 0)
    {
        if (col > 0
            && matrix[row][col] == matrix[row][col-1] + scoreInsertion(seq2.charAt(col)))
        {
            // reached through an insertion: gap in the first sequence
            top.insert (0, GAP_CHARACTER);
            tags.insert (0, GAP_TAG);
            bottom.insert (0, seq2.charAt(col));
            col--;
            continue;
        }

        if (row > 0 && col > 0)
        {
            int sub = scoreSubstitution(seq1.charAt(row), seq2.charAt(col));

            if (matrix[row][col] == matrix[row-1][col-1] + sub)
            {
                // reached through a substitution: both characters are consumed
                top.insert (0, seq1.charAt(row));

                if (seq1.charAt(row) != seq2.charAt(col))
                {
                    // different characters: partial match or plain mismatch
                    if (sub > 0)
                    {
                        tags.insert (0, APPROXIMATE_MATCH_TAG);
                    }
                    else
                    {
                        tags.insert (0, MISMATCH_TAG);
                    }
                }
                else if (useMatchTag())
                {
                    tags.insert (0, MATCH_TAG);
                }
                else
                {
                    tags.insert (0, seq1.charAt(row));
                }

                bottom.insert (0, seq2.charAt(col));
                row--;
                col--;
                continue;
            }
        }

        // neither of the above: must be a deletion (gap in the second sequence)
        top.insert (0, seq1.charAt(row));
        tags.insert (0, GAP_TAG);
        bottom.insert (0,GAP_CHARACTER);
        row--;
    }

    return new PairwiseAlignment (top.toString(), tags.toString(),
                                    bottom.toString(), score);
}
/**
 * Computes the score of the best local alignment between the two sequences using the
 * scoring scheme previously set. Only the similarity value is calculated: a single
 * row or column of the matrix (whichever is shorter) is kept in memory, so the
 * alignment itself cannot be recovered, but only O(n) space is required.
 *
 * @return the score of the best local alignment between the loaded sequences
 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
 * with the loaded sequences.
 */
protected int computeScore () throws IncompatibleScoringSchemeException
{
    final int rows = seq1.length() + 1;
    final int cols = seq2.length() + 1;
    int[] line;
    int pending;

    // highest cell value seen so far (a local alignment never scores below zero)
    int best = 0;

    if (rows > cols)
    {
        // fewer columns than rows: sweep row by row, keeping one row in memory
        line = new int [cols];

        // first row is all zeros
        for (int col = 0; col < cols; col++)
            line[col] = 0;

        for (int row = 1; row < rows; row++)
        {
            // first column is zero; 'pending' always holds the value that still
            // has to be written one position behind the current one
            pending = 0;

            for (int col = 1; col < cols; col++)
            {
                int viaInsertion = pending + scoreInsertion(seq2.charAt(col));
                int viaSubstitution = line[col-1]
                    + scoreSubstitution(seq1.charAt(row), seq2.charAt(col));
                int viaDeletion = line[col] + scoreDeletion(seq1.charAt(row));

                // the previous-row value at col-1 is no longer needed
                line[col-1] = pending;

                // best alternative, or zero if all of them are negative
                pending = max (viaInsertion, viaSubstitution, viaDeletion, 0);

                if (pending > best)
                    best = pending;
            }

            // store the last value of the row
            line[cols - 1] = pending;
        }
    }
    else
    {
        // otherwise sweep column by column, keeping one column in memory
        line = new int [rows];

        // first column is all zeros
        for (int row = 0; row < rows; row++)
            line[row] = 0;

        for (int col = 1; col < cols; col++)
        {
            // first row is zero
            pending = 0;

            for (int row = 1; row < rows; row++)
            {
                int viaInsertion = line[row] + scoreInsertion(seq2.charAt(col));
                int viaSubstitution = line[row-1]
                    + scoreSubstitution(seq1.charAt(row), seq2.charAt(col));
                int viaDeletion = pending + scoreDeletion(seq1.charAt(row));

                // the previous-column value at row-1 is no longer needed
                line[row-1] = pending;

                // best alternative, or zero if all of them are negative
                pending = max (viaInsertion, viaSubstitution, viaDeletion, 0);

                if (pending > best)
                    best = pending;
            }

            // store the last value of the column
            line[rows - 1] = pending;
        }
    }

    return best;
}
}
neobio-0.0.20030929/src/neobio/alignment/ScoringMatrix.java 0000644 0002656 0002032 00000032555 07725336634 022364 0 ustar tillea admin /*
* ScoringMatrix.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.IOException;
/**
* This class implements a scoring scheme based on a substitution matrix. It is useful
* to represent PAM and BLOSUM family of amino acids scoring matrices. Its constructor
* loads such matrices from a file (or any other character stream). The following is an
* extract of a BLOSUM62 scoring matrix file:
*
* A R N D C Q E G H I L K M F P S T W Y V B Z X *
* A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
* R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
* ...
* B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
* Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
* X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
*
*
* <P>Matrices are expected to follow this format. They must have one row and one column
* for each defined character (not necessarily in the same order). Each row and column
* must start with a distinct character (no repetition) and all row characters must have a
* corresponding column, and vice versa.</P>
*
* <P>The value at position (i,j) represents the score of substituting the character of
* row i for the character of column j. Insertion penalties are specified by the last row
* while deletion penalties must be located at the last column (both represented by the
* special character defined by the <CODE>INDEL_CHAR</CODE> constant). Note that it only
* supports an additive gap cost function. In case any of these rules are not followed, an
* {@linkplain InvalidScoringMatrixException} exception is raised by the constructor.</P>
*
* <P>If a scoring operation (substitution, insertion or deletion) involves a character
* not found in the matrix, an exception is raised.</P>
* * @author Sergio A. de Carvalho Jr. * @see InvalidScoringMatrixException */ public class ScoringMatrix extends ScoringScheme { /** * The character that indicates the row and column for insertion and deletion * penalties in the matrix. */ protected static final char INDEL_CHAR = '*'; /** * The character used to start a comment line in the scoring matrix file. */ protected static final char COMMENT_CHAR = '#'; /** * Stores matrix column headers in the order they were found. */ protected String col_codes; /** * Stores matrix row headers in the order they were found. */ protected String row_codes; /** * Stores values for each operation (substitution, insertion or deletion) defined by * this matrix. */ protected int matrix[][]; /** * Dimension of the (squared) matrix. */ protected int dimension; /** * The maximum absolute score that this matrix can return for any substitution, * deletion or insertion. */ protected int max_absolute_score; /** * Creates a new instance of a substitution matrix loaded from the character stream. * The case of characters is significant when subsequently computing their score. * * @param input character stream from where the matrix is read * @throws IOException if an I/O operation fails when reading from input * @throws InvalidScoringMatrixException if the matrix does not comply with the * specification */ public ScoringMatrix (Reader input) throws IOException, InvalidScoringMatrixException { this (input, true); } /** * Creates a new instance of a substitution matrix loaded from the character stream. * Ifcase_sensitive
is true
, the case of characters is
* significant when subsequently computing their score; otherwise the case is
* ignored.
*
* @param input character stream from where the matrix is read
* @param case_sensitive true
if the case of characters must be
* @throws IOException if an I/O operation fails when reading from input
* @throws InvalidScoringMatrixException if the matrix does not comply with the
* specification
*/
public ScoringMatrix (Reader input, boolean case_sensitive)
    throws IOException, InvalidScoringMatrixException
{
    super (case_sensitive);

    StreamTokenizer in;
    StringBuffer    buf = new StringBuffer();
    int             row, col, max_abs = 0;
    char            c;

    // create a stream tokenizer on top of the input
    // stream and set the COMMENT_CHAR as the comment character
    in = new StreamTokenizer(input);
    in.commentChar(COMMENT_CHAR);

    // consider ends of line when reading the first row
    in.eolIsSignificant(true);

    // skip blank lines (if any)
    for (in.nextToken(); in.ttype == StreamTokenizer.TT_EOL; in.nextToken());

    // read first row: column character codes
    while ((in.ttype != StreamTokenizer.TT_EOF) &&
           (in.ttype != StreamTokenizer.TT_EOL))
    {
        if (in.ttype == StreamTokenizer.TT_WORD)
        {
            if (in.sval.length() > 1)
                throw new InvalidScoringMatrixException
                    ("Column headers must have one-character only.");

            buf.append(in.sval.charAt(0));
        }
        else if (in.ttype == INDEL_CHAR)
        {
            buf.append(INDEL_CHAR);
        }
        else
        {
            throw new InvalidScoringMatrixException("Column headers must be " +
                "one-character codes or the special character '" + INDEL_CHAR + "'.");
        }

        in.nextToken();
    }

    // convert everything to upper case if it's not case sensitive; the folding is
    // done character by character so that it matches the lookups performed by
    // scoreSubstitution and never changes the number of codes
    if (case_sensitive)
        col_codes = buf.toString();
    else
        col_codes = upperCaseCodes(buf.toString());

    dimension = col_codes.length();

    // check if there's a column for deletion penalties
    if (col_codes.indexOf (INDEL_CHAR) == -1)
        throw new InvalidScoringMatrixException
            ("Matrix have no column for deletion penalties.");

    // check if there is at least one character code (besides the INDEL char)
    if (dimension < 2)
        throw new InvalidScoringMatrixException
            ("Matrix must have at least one column with a character code.");

    // check for repeated column codes
    for (int i = 0; i < dimension; i++)
        if (col_codes.indexOf(col_codes.charAt(i),i+1) > i)
            throw new InvalidScoringMatrixException
                ("Columns must have distinct one-character codes.");

    // allocate matrix
    matrix = new int[dimension][dimension];

    // reset buffer
    buf.delete (0, dimension);

    // from now on, ignore ends of line
    in.eolIsSignificant(false);
    if (in.ttype == StreamTokenizer.TT_EOL) in.nextToken();

    // read rest of matrix (one line for each character, but
    // not necessarily in the same order as the columns);
    // at most 'dimension' rows are read, anything after them is not examined
    for (row = 0; row < dimension && in.ttype != StreamTokenizer.TT_EOF; row++)
    {
        // start reading the line: the character code must come first
        if (in.ttype == StreamTokenizer.TT_WORD)
        {
            if (in.sval.length() > 1)
                throw new InvalidScoringMatrixException
                    ("Codes must have one character only.");

            buf.append(in.sval.charAt(0));
        }
        else if (in.ttype == INDEL_CHAR)
        {
            buf.append(INDEL_CHAR);
        }
        else
        {
            throw new InvalidScoringMatrixException ("Rows must start with an" +
                " one-character code or the special character '" + INDEL_CHAR + "'.");
        }

        // now, the set of values
        for (col = 0; col < dimension; col++)
        {
            // start reading the values
            if (in.nextToken() != StreamTokenizer.TT_NUMBER)
                throw new InvalidScoringMatrixException
                    ("Invalid value at row " + (row+1) + ", column " + (col+1) + ".");

            // the tokenizer delivers a double; only its integer part is kept
            matrix[row][col] = (int) in.nval;

            if (Math.abs(matrix[row][col]) > max_abs)
                max_abs = Math.abs(matrix[row][col]);
        }

        in.nextToken();
    }

    // convert everything to upper case if it's not case sensitive
    // (same per-character folding as for the column codes)
    if (case_sensitive)
        row_codes = buf.toString();
    else
        row_codes = upperCaseCodes(buf.toString());

    // check if read as many rows as columns
    if (row_codes.length() != dimension)
        throw new InvalidScoringMatrixException
            ("Matrix must have as many rows as columns.");

    // check if there's a row for insertion penalties
    if (row_codes.indexOf(INDEL_CHAR) == -1)
        throw new InvalidScoringMatrixException
            ("Matrix have no row for insertion penalties.");

    // check for repeated row codes
    for (int i = 0; i < dimension; i++)
        if (row_codes.indexOf(row_codes.charAt(i),i+1) > i)
            throw new InvalidScoringMatrixException
                ("Rows must have distinct one-character codes.");

    // check if all rows have a corresponding column
    for (int i = 0; i < dimension; i++)
        if (col_codes.indexOf(c = row_codes.charAt(i)) == -1)
            throw new InvalidScoringMatrixException
                ("There is no corresponding column for row character '" + c + "'.");

    // store the maximum absolute value found
    this.max_absolute_score = max_abs;
}

/**
 * Converts every character code of a header string to upper case, one character at
 * a time. Unlike <CODE>String.toUpperCase()</CODE>, this conversion does not depend
 * on the default locale and always returns a string of the same length as its
 * argument, which keeps the stored codes consistent with the
 * <CODE>Character.toUpperCase</CODE> folding applied when scores are looked up.
 *
 * @param codes the character codes in the order they were read
 * @return the same codes, each converted with <CODE>Character.toUpperCase</CODE>
 */
private static String upperCaseCodes (String codes)
{
    StringBuffer upper = new StringBuffer(codes.length());

    for (int i = 0; i < codes.length(); i++)
        upper.append(Character.toUpperCase(codes.charAt(i)));

    return upper.toString();
}
/**
 * Returns the score of a substitution of character <CODE>a</CODE> for character
 * <CODE>b</CODE> according to this scoring matrix. Character <CODE>a</CODE> selects
 * the row and character <CODE>b</CODE> selects the column of the matrix. When this
 * scheme is not case sensitive, both characters are converted to upper case before
 * being looked up.
 *
 * @param a first character
 * @param b second character
 * @return score of a substitution of character <CODE>a</CODE> for <CODE>b</CODE>
 * @throws IncompatibleScoringSchemeException if this substitution is not defined
 */
public int scoreSubstitution (char a, char b)
    throws IncompatibleScoringSchemeException
{
    // pick the form of each character that is used as the lookup key
    final char row_key = case_sensitive ? a : Character.toUpperCase(a);
    final char col_key = case_sensitive ? b : Character.toUpperCase(b);

    final int row_index = row_codes.indexOf(row_key);
    final int col_index = col_codes.indexOf(col_key);

    if (row_index >= 0 && col_index >= 0)
        return matrix[row_index][col_index];

    // at least one of the characters has no row/column in the matrix
    throw new IncompatibleScoringSchemeException ("Substitution of character " +
        a + " for " + b + " is not defined.");
}
/**
 * Returns the score of an insertion of character <CODE>a</CODE> according to this
 * scoring matrix. The score is looked up as a substitution of the special
 * <CODE>INDEL_CHAR</CODE> (row) for <CODE>a</CODE> (column).
 *
 * @param a character to be inserted
 * @return score of insertion of <CODE>a</CODE>
 * @throws IncompatibleScoringSchemeException if this character is not recognised
 */
public int scoreInsertion (char a) throws IncompatibleScoringSchemeException
{
return scoreSubstitution (INDEL_CHAR, a);
}
/**
 * Returns the score of a deletion of character <CODE>a</CODE> according to this
 * scoring matrix. The score is looked up as a substitution of <CODE>a</CODE> (row)
 * for the special <CODE>INDEL_CHAR</CODE> (column).
 *
 * @param a character to be deleted
 * @return score of deletion of <CODE>a</CODE>
 * @throws IncompatibleScoringSchemeException if this character is not recognised
 */
public int scoreDeletion (char a) throws IncompatibleScoringSchemeException
{
return scoreSubstitution (a, INDEL_CHAR);
}
/**
 * Tells whether this scoring scheme supports partial matches, which it does, although
 * a particular scoring matrix loaded by this instance might not. A partial match is
 * a situation when two characters are not equal but, for any reason, are regarded
 * as similar by this scoring scheme, which then returns a positive score value. This
 * is common for amino acid scoring matrices.
 *
 * @return always returns <CODE>true</CODE>
 */
public boolean isPartialMatchSupported ()
{
return true;
}
/**
 * Returns the maximum absolute score that this scoring scheme can return for any
 * substitution, deletion or insertion. The value is the largest absolute value
 * found among the matrix entries read when the matrix was loaded.
 *
 * @return maximum absolute score that can be returned
 */
public int maxAbsoluteScore ()
{
return max_absolute_score;
}
/**
 * Returns a String representation of this scoring matrix. The text is a
 * tab-separated table: a title followed by the column numbers, a line with the
 * column character codes, and then one line per row holding the row number, the
 * row character code and the row's values.
 *
 * @return a String representation of this scoring matrix
 */
public String toString ()
{
    final StringBuffer text = new StringBuffer("Scoring matrix:\n\t");

    // first header line: column numbers
    for (int j = 0; j < dimension; j++)
        text.append('\t').append(j);

    // second header line: column character codes
    text.append("\n\t");
    for (int j = 0; j < dimension; j++)
        text.append('\t').append(col_codes.charAt(j));

    // one line per row: row number, row code and the values of that row
    for (int i = 0; i < dimension; i++)
    {
        text.append('\n').append(i).append('\t').append(row_codes.charAt(i));

        for (int j = 0; j < dimension; j++)
            text.append('\t').append(matrix[i][j]);
    }

    return text.toString();
}
}
neobio-0.0.20030929/src/neobio/alignment/OutMatrix.java 0000644 0002656 0002032 00000012466 07724676562 021534 0 ustar tillea admin /*
* OutMatrix.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package neobio.alignment;
/**
 * Implements an interface to the OUT matrix of a block. This class is used by the
 * {@linkplain CrochemoreLandauZivUkelson} and subclasses to encode the OUT matrix
 * from the input border and DIST matrix of an {@linkplain AlignmentBlock}.
 *
 * <P>The OUT matrix is defined as <CODE>OUT[i,j] = I[i] + DIST[i,j]</CODE> where
 * <CODE>I</CODE> is the input border array and <CODE>DIST</CODE> is the DIST
 * matrix.</P>
 *
 * <P>The output border of a block is computed from the OUT matrix by taking the
 * maximum value of each column. Note that this class <B>does not compute the OUT
 * matrix</B>, it just stores the necessary information to retrieve a value at any
 * position of the matrix.</P>
 *
 * <P>It implements the <CODE>Matrix</CODE> interface so that the SMAWK algorithm can
 * be used to compute its column maxima.</P>
 *
 * <P>For more information on how this class is used, please refer to the
 * specification of the <CODE>CrochemoreLandauZivUkelson</CODE> and its
 * subclasses.</P>
 *
 * @author Sergio A. de Carvalho Jr.
 * @see CrochemoreLandauZivUkelson
 * @see CrochemoreLandauZivUkelsonGlobalAlignment
 * @see CrochemoreLandauZivUkelsonLocalAlignment
 * @see AlignmentBlock
 * @see Smawk
 */
public class OutMatrix implements Matrix
{
    /**
     * The length of the longest sequence (number of characters) being aligned. It
     * needs to be set only once per alignment.
     */
    protected int max_length;

    /**
     * The maximum absolute score that the current scoring scheme can return. It needs
     * to be set only once per alignment.
     */
    protected int max_score;

    /**
     * The DIST matrix of a block, indexed by <CODE>[column][row]</CODE>.
     */
    protected int[][] dist;

    /**
     * The input border of a block.
     */
    protected int[] input_border;

    /**
     * The dimension of the OUT matrix (its number of rows and of columns).
     */
    protected int dim;

    /**
     * The number of columns of the block.
     */
    protected int lc;

    /**
     * Initialises this OUT matrix interface. This method needs to be executed only
     * once per alignment.
     *
     * @param max_length the length of the longest sequence (number of characters)
     * being aligned
     * @param max_score the maximum absolute score that the current scoring scheme can
     * return
     */
    public void init (int max_length, int max_score)
    {
        this.max_score  = max_score;
        this.max_length = max_length;
    }

    /**
     * Sets this interface's data to represent an OUT matrix for a block. This method
     * is typically executed once for each block being aligned. The arrays are stored
     * by reference, not copied.
     *
     * @param dist the DIST matrix
     * @param input_border the input border
     * @param dim the dimension of the OUT matrix
     * @param lc the number of columns of the block
     */
    public void setData (int[][] dist, int[] input_border, int dim, int lc)
    {
        this.dim          = dim;
        this.lc           = lc;
        this.dist         = dist;
        this.input_border = input_border;
    }

    /**
     * Returns the value at a given position of the matrix. In general it returns the
     * value of <CODE>DIST[col][row] + input_border[row]</CODE>. However, special
     * cases occur for its upper right and lower left triangular parts.
     *
     * @param row row index
     * @param col column index
     * @return the value at row <CODE>row</CODE>, column <CODE>col</CODE> of this OUT
     * matrix
     */
    public int valueAt (int row, int col)
    {
        // The DIST matrix is indexed by [column][row]
        if (col > lc)
        {
            // columns to the right of column lc: the DIST row index is shifted
            final int shift = col - lc;

            if (row < shift)
                // upper right triangle entries
                return Integer.MIN_VALUE + row;

            return dist[col][row - shift] + input_border[row];
        }

        if (col < lc && row >= dim - (lc - col))
            // lower left triangle entries
            return - (max_length + row + 1) * max_score;

        // column lc itself, or a regular entry to the left of it
        return dist[col][row] + input_border[row];
    }

    /**
     * Returns the number of rows of this OUT matrix.
     *
     * @return the number of rows of this OUT matrix
     */
    public int numRows ()
    {
        return dim;
    }

    /**
     * Returns the number of columns of this OUT matrix.
     *
     * @return the number of columns of this OUT matrix
     */
    public int numColumns ()
    {
        return dim;
    }
}
neobio-0.0.20030929/license.txt 0000644 0002656 0002032 00000043131 07627203426 015054 0 ustar tillea admin GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Library General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.