|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/markrose/ |
Upload File : |
/*
** MARKOV.C
** (c) 2018 by Mark Rosenfelder
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define TRUE 1
#define FALSE 0
typedef struct wd Word;
typedef struct sec /* Record of second words and their counts */
{
Word *w; // the word
int count; // how many times we found it
struct sec *next;
} Second;
typedef struct wd /* Record of one word found */
{
char *token; // Pointer to the token itself
Second *seconds; // Words that follow it
} Word;
#define MAXTOKEN 40000
int nToken; /* Number of words defined */
Word *tokens[MAXTOKEN]; /* List of words and counts */
Word *lastToken = NULL;
#define isPunc(ch) (strchr("—,.;?!¿¡“”\"",ch))
Word *searchToken( char *token, int create )
/* Find the token in the token list, using a binary search.
*
* If the token is found, return a pointer to it.
*
* If not, add it here. We will know where to put it from where
* the search stopped, and since the list of tokens is linear it's
* a simple matter to update it: we just move everything down one.
*
* Don’t add a new token if create is false.
*
* If there are too many tokens to add another, we just return NULL.
*/
{
int beg, mid, end, diff;
if (nToken == 0)
{
/* Blank list? Easy to see where to add the new entry. */
mid = 0;
}
else
{
/* Search for the entry, using a binary search. */
beg = 0;
end = nToken - 1;
while (beg <= end)
{
mid = (beg + end) / 2;
diff = strcmp( token, tokens[mid]->token );
if (!diff) /* token == mid */
return( tokens[mid] );
else
if (diff > 0) /* token > mid */
beg = mid + 1;
else
end = mid - 1; /* token < mid */
}
/* Once the loop fails, the new entry should be added
* either at mid or at mid + 1, depending on the results
* of the last search.
*/
if (diff > 0)
mid++;
}
if (create)
{
if (nToken < MAXTOKEN)
{
/* Add the new entry. */
memmove( &tokens[mid+1], &tokens[mid],
(nToken - mid) * sizeof(Word) );
Word *w = malloc(sizeof(Word));
if (!w) { printf("Out of memory\n"); exit(1); };
w->seconds = NULL;
w->token = malloc( strlen(token) + 1 );
if (!w->token) { printf("Out of memory\n"); exit(1); };
strcpy( w->token, token );
tokens[mid] = w;
nToken++;
return( w );
}
else
{
printf( "\nTOO MANY WORDS!\n" );
exit(1);
}
}
/* Too many entries, or no creation wanted. Return. */
return(NULL);
} /*searchToken*/
/* Find the word w in the list of seconds.
* If found, increment its count.
* If not, add to the list.
* This is an unsorted list, but we hope it doesn't grow too large.
*/
void incSecondList(Word *first, Word *second)
{
Second *sec;
Second *last = NULL;
/* Linear search for second word */
for (sec = first->seconds; sec; sec = sec->next)
{
if (!strcmp(sec->w->token, second->token))
break;
last = sec;
}
/* If we found it, increment it and go */
if (sec) {
sec->count++;
return;
}
/* Didn't find it. Allocate a new one */
sec = malloc(sizeof(Second));
if (!sec) { printf("Out of memory\n"); exit(1); }
sec->w = second;
sec->count = 1;
sec->next = NULL;
// Link to the new object
if (last)
last->next = sec;
else
first->seconds = sec;
}
/* We have a new token.
* Add it to the list of words in lastToken.
*/
void addToken(char *token)
{
Word *w = searchToken(token, TRUE);
if (lastToken)
{
incSecondList(lastToken, w);
}
lastToken = w;
}
int NextToken( char **sp )
/* Gets the next token in s (if any) and processes it.
* The search begins at *sp. *sp is incremented so this routine
* can be called in a loop.
*
* RETURN VALUE
* Whether there is anything else to process in this line.
*/
{
char ch;
char *s, *token;
/* Skip blanks */
for (s = *sp; *s && isspace(*s); s++)
;
/* Token = part of s before next blank.
* Either get a string of punctuation, or a string of non-punctuation.
*/
if (isPunc(*s)) {
for (token = s; *s && isPunc(*s) && !isspace(*s); s++)
;
} else {
for (token = s; *s && !isPunc(*s) && !isspace(*s); s++)
*s = tolower(*s);
}
*sp = s;
if (s - token)
{
/* Temporarily add a null character at end of token */
ch = *s;
*s = '\0';
/* Process token */
addToken(token);
*s = ch;
return(TRUE);
}
else
return(FALSE);
} /*NextToken*/
// Read the corpus
void ReadFile( char *fn )
{
FILE *f;
char raw[512];
char cooked[1024];
char remainder[512];
char *t;
*remainder = '\0';
f = fopen( fn, "r" );
if (!f)
{
printf( "Can’t open file %s\n", fn );
return;
}
printf( "Reading file %s.\n", fn );
while (fgets( raw, 100, f ))
{
/* Unix/Mac discrepancy with CRLF. Remove both. */
for (t = raw; *t; t++)
if (*t == '\r' || *t == '\n')
*t = ' ';
/* Add remainder from previous line, if any */
strcpy(cooked, remainder);
strcat(cooked, raw);
/* Find the new remainder */
t = cooked + strlen(cooked) - 1;
while (t > cooked && !isspace(*t))
t--;
if (t == cooked)
*remainder = '\0';
else {
strcpy(remainder, t + 1);
*t = '\0';
}
/* Process tokens */
for (t = cooked; NextToken(&t); )
;
}
/* there might be something in remainder */
for (t = remainder; NextToken(&t); )
;
fclose(f);
} /*ReadFile*/
/* Generate one word from this token */
Word *markovNode(Word *w)
{
if (!w) return(NULL);
/* Get the total word count */
int n = 0;
Second *sec;
for (sec = w->seconds; sec; sec = sec->next)
n += sec->count;
/* Pick a random number between 0 and n - 1 */
int tgt = rand() % n;
/* Go through counts again, stopping when we hit tgt */
n = 0;
for (sec = w->seconds; sec; sec = sec->next)
{
n += sec->count;
if (n >= tgt)
return sec->w;
}
return NULL;
}
/* Generate a text */
void markovStart()
{
/* This is a kludge.
* Find the first word that has over 1000 second-words.
*/
int i;
int c;
for (i = 0; i < nToken; i++) {
Second *sec = NULL;
c = 0;
for (sec = tokens[i]->seconds; sec; sec = sec->next)
c++;
if (c > 100) break;
}
if (i >= nToken) { printf( "No start points found\n"); return; }
printf("Initial token should be %s with a count of %i\n", tokens[i]->token, c );
/* Generate a text */
Word *w = tokens[i];
for (i = 0; w && i < 250; i++) {
char ch = w->token[0];
if (!isPunc(ch))
printf(" ");
printf( "%s", w->token );
if (ch == '.' || ch == '?' || ch == '!')
printf("\n");
w = markovNode(w);
}
printf("\n");
}
// Process
int main(int argc, char *argv[])
{
int i;
/* Initializes random number generator */
srand((unsigned) time(NULL));
/* Read the input file(s) */
if (argc == 1)
ReadFile("4901");
else
for (i = 1; i < argc; i++)
ReadFile(argv[i]);
printf("Total tokens in corpus: %i\n", nToken );
/* Debug
for (i = 0; i < nToken; i++) {
printf("%s\n", tokens[i]->token);
Second *sec = NULL;
for (sec = tokens[i]->seconds; sec; sec = sec->next) {
printf(" %s %i\n", sec->w->token, sec->count );
}
}
*/
/* Generate text */
markovStart();
}